Basic Regression Workflow
Complete end-to-end example: data preparation → modeling → validation → deployment.
The 10-Step Workflow
Step 1: Prepare Data
CREATE TABLE sales_with_features AS
SELECT
month_id,
revenue,
marketing_spend,
team_size,
CASE WHEN month_id IN (11, 12) THEN 1 ELSE 0 END as holiday_season
FROM raw_sales
WHERE revenue IS NOT NULL
AND marketing_spend IS NOT NULL;
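Before splitting, a quick sanity check of the prepared table helps catch obvious data problems early; the query below is a plain-SQL sketch against the sales_with_features table created above.
-- Quick sanity check: row count, remaining nulls, and holiday-season months
SELECT
COUNT(*) as n_rows,
SUM(CASE WHEN team_size IS NULL THEN 1 ELSE 0 END) as null_team_size,
SUM(holiday_season) as holiday_months
FROM sales_with_features;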
Step 2: Split Train-Test
CREATE TABLE train_data AS
SELECT * FROM sales_with_features WHERE month_id <= 36;
CREATE TABLE test_data AS
SELECT * FROM sales_with_features WHERE month_id > 36;
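A row count per split confirms the month_id cutoff leaves enough observations on both sides; the 36-month cutoff assumes roughly four years of history, so adjust it to your data.
-- Confirm split sizes
SELECT 'train' as split, COUNT(*) as n FROM train_data
UNION ALL
SELECT 'test', COUNT(*) FROM test_data;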
Step 3: Check Multicollinearity
SELECT variable, vif
FROM anofox_statistics_vif(
ARRAY['marketing_spend', 'team_size', 'holiday_season']
);
-- All VIF < 10? Continue with OLS. Otherwise, drop a collinear predictor or switch to Ridge.
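As a quick pre-screen alongside the VIF test, pairwise correlations between the predictors flag the most obvious collinearity; this sketch uses the standard corr() aggregate on the training data.
-- Pairwise correlations between predictors (values near ±1 suggest collinearity)
SELECT
corr(marketing_spend, team_size) as spend_vs_team,
corr(marketing_spend, holiday_season) as spend_vs_holiday,
corr(team_size, holiday_season) as team_vs_holiday
FROM train_data;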
Step 4: Fit OLS Model
CREATE TABLE model_results AS
SELECT
coefficient,
std_error,
t_statistic,
p_value,
r_squared,
adjusted_r_squared,
aic,
bic,
rmse
FROM anofox_statistics_ols(
'train_data',
'revenue',
ARRAY['marketing_spend', 'team_size', 'holiday_season']
);
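With the model table in place, the overall fit statistics can be read back directly before moving on; a minimal query over the columns returned above.
-- Overall goodness of fit on the training data
SELECT r_squared, adjusted_r_squared, aic, bic, rmse
FROM model_results
LIMIT 1;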
Step 5: Validate Assumptions
-- Check that the residuals are approximately normal
-- (residuals = actual - predicted values from the training fit; see the sketch below)
SELECT * FROM anofox_statistics_normality_test(residuals);
-- Check residual diagnostics for influential points (Cook's distance)
SELECT
y_actual,
y_predicted,
cooks_distance
FROM anofox_statistics_residual_diagnostics(actual, predicted);
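The calls above take the model's residuals and the actual/predicted pairs as input. One way to produce them is to score the training data with the same interval function used in Step 7; this is a sketch that assumes the function accepts the training table as its scoring input, mirroring the call shown there.
-- Score the training data to obtain actual, predicted, and residual values
CREATE TABLE train_residuals AS
SELECT
train_data.revenue as actual,
pred.point_estimate as predicted,
(train_data.revenue - pred.point_estimate) as residual
FROM train_data
CROSS JOIN LATERAL anofox_statistics_ols_predict_interval(
'train_data',
'revenue',
ARRAY['marketing_spend', 'team_size', 'holiday_season'],
train_data,
0.95
) as pred;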
Step 6: Test Significance
SELECT
'marketing_spend' as variable,
coefficient[1] as coeff,
p_value[1] as p_val,
CASE WHEN p_value[1] < 0.05 THEN 'Significant' ELSE 'Not significant' END as result
FROM model_results
UNION ALL
SELECT 'team_size', coefficient[2], p_value[2],
CASE WHEN p_value[2] < 0.05 THEN 'Significant' ELSE 'Not significant' END
FROM model_results
UNION ALL
SELECT 'holiday_season', coefficient[3], p_value[3],
CASE WHEN p_value[3] < 0.05 THEN 'Significant' ELSE 'Not significant' END
FROM model_results;
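Since coefficient and p_value appear to be stored as arrays (note the [1], [2], [3] indexing), the repeated UNION ALL can also be written as a single cross join against a small list of variable names and positions; the bracket indexing with a computed position below is an assumption and may need list_extract instead, depending on the engine.
-- Same significance table without repeating the query per variable
SELECT
v.variable,
coefficient[v.idx] as coeff,
p_value[v.idx] as p_val,
CASE WHEN p_value[v.idx] < 0.05 THEN 'Significant' ELSE 'Not significant' END as result
FROM model_results,
(VALUES ('marketing_spend', 1), ('team_size', 2), ('holiday_season', 3)) as v(variable, idx);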
Step 7: Generate Test Predictions
CREATE TABLE test_predictions AS
SELECT
test_data.*,
pred.point_estimate as predicted_revenue,
pred.lower_95,
pred.upper_95,
(test_data.revenue - pred.point_estimate) as residual
FROM test_data
CROSS JOIN LATERAL anofox_statistics_ols_predict_interval(
'train_data',
'revenue',
ARRAY['marketing_spend', 'team_size', 'holiday_season'],
test_data,
0.95
) as pred;
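Before scoring the intervals formally in Step 8, it is worth checking how wide they are relative to actual revenue, since very wide intervals are technically valid but not useful; a sketch over the test_predictions table just created.
-- Average 95% interval width, absolute and relative to actual revenue
SELECT
AVG(upper_95 - lower_95) as avg_interval_width,
AVG((upper_95 - lower_95) / revenue) as avg_relative_width
FROM test_predictions;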
Step 8: Calculate Test Metrics
SELECT
SQRT(AVG(residual^2))::DECIMAL(10,2) as test_rmse,
AVG(ABS(residual)/revenue)::DECIMAL(5,4) as test_mape,
SUM(CASE WHEN revenue BETWEEN lower_95 AND upper_95 THEN 1 ELSE 0 END)
::FLOAT / COUNT(*)::FLOAT as coverage
FROM test_predictions;
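Comparing the test RMSE against the training RMSE gives a quick overfitting check; this sketch assumes the rmse column in model_results is the training-set RMSE.
-- Test RMSE vs. training RMSE; a large gap suggests overfitting
SELECT
(SELECT rmse FROM model_results LIMIT 1) as train_rmse,
SQRT(AVG(residual^2)) as test_rmse
FROM test_predictions;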
Step 9: Compare to Baselines
SELECT
'Naive (mean)' as model,
SQRT(AVG((revenue - (SELECT AVG(revenue) FROM train_data))^2)) as rmse
FROM test_data
UNION ALL
SELECT
'OLS',
SQRT(AVG(residual^2))
FROM test_predictions;
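A seasonal baseline (same month one year earlier) is often harder to beat than the overall mean; this sketch assumes month_id is a consecutive month index, so month_id - 12 refers to the same calendar month of the previous year.
-- Seasonal naive: predict this month's revenue with the value 12 months earlier
SELECT
'Seasonal naive (t-12)' as model,
SQRT(AVG((t.revenue - s.revenue)^2)) as rmse
FROM test_data t
JOIN sales_with_features s ON s.month_id = t.month_id - 12;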
Step 10: Deploy & Monitor
-- Save model coefficients for production
CREATE TABLE model_production AS
SELECT
coefficient,
std_error,
CURRENT_TIMESTAMP as created_at,
'v1.0' as model_version
FROM model_results;
-- Monitor performance weekly
CREATE VIEW model_performance_weekly AS
SELECT
DATE_TRUNC('week', date) as week,
SQRT(AVG((revenue - prediction)^2)) as weekly_rmse,
COUNT(*) as n_observations
FROM production_predictions
GROUP BY week
ORDER BY week DESC;
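On top of the weekly view, a simple alert query can flag weeks whose error drifts well above the level measured at deployment. The model_baseline_metrics table name and the 1.5x threshold below are illustrative choices, not part of the extension.
-- Store the deployment-time test RMSE (illustrative table name)
CREATE TABLE model_baseline_metrics AS
SELECT
'v1.0' as model_version,
SQRT(AVG(residual^2)) as test_rmse
FROM test_predictions;
-- Flag weeks whose RMSE exceeds 1.5x the deployment baseline
SELECT w.week, w.weekly_rmse, b.test_rmse
FROM model_performance_weekly w, model_baseline_metrics b
WHERE w.weekly_rmse > 1.5 * b.test_rmse
ORDER BY w.week DESC;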
Complete Copy-Paste Script
LOAD anofox_statistics;
-- Prepare
CREATE TABLE train_data AS SELECT * FROM sales WHERE month_id <= 36;
CREATE TABLE test_data AS SELECT * FROM sales WHERE month_id > 36;
-- Fit
CREATE TABLE model AS SELECT * FROM anofox_statistics_ols(
'train_data', 'revenue', ARRAY['marketing_spend', 'team_size']
);
-- Predict
CREATE TABLE predictions AS
SELECT test_data.*,
pred.point_estimate as predicted
FROM test_data
CROSS JOIN LATERAL anofox_statistics_ols_predict_interval(
'train_data', 'revenue', ARRAY['marketing_spend', 'team_size'],
test_data, 0.95
) as pred;
-- Evaluate
SELECT SQRT(AVG((revenue - predicted)^2))::DECIMAL(10,2) as rmse
FROM predictions;
Next Steps
- Grouped Analysis — Per-segment models
- Model Selection — Comparing models
- Production Deployment — Scaling