Skip to main content

Basic Regression Workflow

Complete end-to-end example: data preparation → modeling → validation → deployment.

The 10-Step Workflow

Step 1: Prepare Data

CREATE TABLE sales_with_features AS
SELECT
month_id,
revenue,
marketing_spend,
team_size,
CASE WHEN month_id IN (11, 12) THEN 1 ELSE 0 END as holiday_season
FROM raw_sales
WHERE revenue IS NOT NULL
AND marketing_spend IS NOT NULL;

Step 2: Split Train-Test

CREATE TABLE train_data AS
SELECT * FROM sales_with_features WHERE month_id <= 36;

CREATE TABLE test_data AS
SELECT * FROM sales_with_features WHERE month_id > 36;

Step 3: Check Multicollinearity

SELECT variable, vif
FROM anofox_statistics_vif(
ARRAY['marketing_spend', 'team_size', 'holiday_season']
);
-- All VIF < 10? Continue. Otherwise, use Ridge.

Step 4: Fit OLS Model

CREATE TABLE model_results AS
SELECT
coefficient,
std_error,
t_statistic,
p_value,
r_squared,
adjusted_r_squared,
aic,
bic,
rmse
FROM anofox_statistics_ols(
'train_data',
'revenue',
ARRAY['marketing_spend', 'team_size', 'holiday_season']
);

Step 5: Validate Assumptions

-- Check normality
SELECT * FROM anofox_statistics_normality_test(residuals);

-- Check diagnostics
SELECT
y_actual,
y_predicted,
cooks_distance
FROM anofox_statistics_residual_diagnostics(actual, predicted);

Step 6: Test Significance

SELECT
'marketing_spend' as variable,
coefficient[1] as coeff,
p_value[1] as p_val,
CASE WHEN p_value[1] &lt; 0.05 THEN 'Significant' ELSE 'Not significant' END as result
FROM model_results
UNION ALL
SELECT 'team_size', coefficient[2], p_value[2], ... FROM model_results
UNION ALL
SELECT 'holiday_season', coefficient[3], p_value[3], ... FROM model_results;

Step 7: Generate Test Predictions

CREATE TABLE test_predictions AS
SELECT
test_data.*,
pred.point_estimate as predicted_revenue,
pred.lower_95,
pred.upper_95,
(test_data.revenue - pred.point_estimate) as residual
FROM test_data
CROSS JOIN LATERAL anofox_statistics_ols_predict_interval(
'train_data',
'revenue',
ARRAY['marketing_spend', 'team_size', 'holiday_season'],
test_data,
0.95
) as pred;

Step 8: Calculate Test Metrics

SELECT
SQRT(AVG(residual^2))::DECIMAL(10,2) as test_rmse,
AVG(ABS(residual)/revenue)::DECIMAL(5,4) as test_mape,
SUM(CASE WHEN revenue BETWEEN lower_95 AND upper_95 THEN 1 ELSE 0 END)
::FLOAT / COUNT(*)::FLOAT as coverage
FROM test_predictions;

Step 9: Compare to Baselines

SELECT
'Naive (mean)' as model,
SQRT(AVG((revenue - (SELECT AVG(revenue) FROM train_data))^2)) as rmse
FROM test_data
UNION ALL
SELECT
'OLS',
SQRT(AVG(residual^2))
FROM test_predictions;

Step 10: Deploy & Monitor

-- Save model for production
SELECT
coefficient,
std_error,
CURRENT_TIMESTAMP as created_at,
'v1.0' as model_version
INTO model_production
FROM model_results;

-- Monitor performance weekly
CREATE VIEW model_performance_weekly AS
SELECT
DATE_TRUNC('week', date) as week,
SQRT(AVG((revenue - prediction)^2)) as weekly_rmse,
COUNT(*) as n_observations
FROM production_predictions
GROUP BY week
ORDER BY week DESC;

Complete Copy-Paste Script

LOAD anofox_statistics;

-- Prepare
CREATE TABLE train_data AS SELECT * FROM sales WHERE month_id <= 36;
CREATE TABLE test_data AS SELECT * FROM sales WHERE month_id > 36;

-- Fit
CREATE TABLE model AS SELECT * FROM anofox_statistics_ols(
'train_data', 'revenue', ARRAY['marketing_spend', 'team_size']
);

-- Predict
CREATE TABLE predictions AS
SELECT test_data.*,
pred.point_estimate as predicted
FROM test_data
CROSS JOIN LATERAL anofox_statistics_ols_predict_interval(...) pred;

-- Evaluate
SELECT SQRT(AVG((revenue - predicted)^2))::DECIMAL(10,2) as rmse
FROM predictions;

Next Steps

🍪 Cookie Settings