Utilities & Data Preparation Functions
Helper functions for data validation, cleaning, and preparation.
Data Quality & Validation
TS_QUALITY_REPORT
Generate a data quality assessment
SELECT * FROM TS_QUALITY_REPORT(
'sales_data',
'date',
'sales'
);
Returns:
metric | value
------------------------|--------
total_observations | 365
null_count | 0
duplicate_count | 0
min_value | 50.2
max_value | 250.8
mean_value | 150.5
std_dev | 45.3
data_quality_score | 0.95
Use for:
- Initial data exploration
- Quality checks
- Reporting
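The report can also gate a pipeline programmatically. A minimal sketch, assuming the metric/value output shown above (the 0.90 threshold is illustrative, and the ::DOUBLE cast assumes value is stored as text):
-- Proceed only when the overall quality score clears a threshold
SELECT value::DOUBLE >= 0.90 AS quality_ok
FROM TS_QUALITY_REPORT('sales_data', 'date', 'sales')
WHERE metric = 'data_quality_score';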
TS_VALIDATE_DATA
Validate that the data meets requirements
SELECT TS_VALIDATE_DATA(
'sales_data',
'date',
'sales',
{
'min_observations': 60,
'require_no_nulls': true,
'check_duplicates': true
}
);
Validates:
- Minimum observations
- No missing values
- No duplicate dates
- Date ordering
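Assuming the function returns a boolean, as the call above suggests, the result can drive a simple readiness check:
-- Turn the validation result into a human-readable status
SELECT CASE
    WHEN TS_VALIDATE_DATA('sales_data', 'date', 'sales',
        {'min_observations': 60, 'require_no_nulls': true, 'check_duplicates': true})
    THEN 'ready for modeling'
    ELSE 'needs cleaning'
END AS status;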
TS_DETECT_DUPLICATES
Find duplicate dates
SELECT date, COUNT(*) as cnt
FROM TS_DETECT_DUPLICATES('sales_data', 'date')
GROUP BY date
HAVING COUNT(*) > 1;
Returns: Duplicate date values
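Once duplicates are located, plain SQL can resolve them. One common choice is to collapse to one row per date (AVG is shown; MIN, MAX, or last-value are equally valid, so document whichever you pick):
-- Collapse duplicate dates to a single row each
CREATE TABLE sales_dedup AS
SELECT date, AVG(sales) AS sales
FROM sales_data
GROUP BY date;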
Data Cleaning & Gap Filling
TS_FILL_GAPS
Fill missing dates and interpolate values
SELECT * FROM TS_FILL_GAPS(
'sales_data',
'date',
'sales',
{
'method': 'linear', -- linear, forward_fill, backward_fill, mean
'max_gap': 7, -- Max gap to fill (days)
'frequency': 'day' -- day, week, month, etc
}
);
Methods:
- 'linear': Interpolate between values
- 'forward_fill': Repeat last value
- 'backward_fill': Use next value
- 'mean': Use average value
Use for:
- Complete time series
- Regular intervals
- Missing dates
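After filling, it is worth confirming the series really is gap-free. A sketch for daily data, assuming date subtraction yields whole days (as in DuckDB/Postgres):
-- Count any day-over-day steps larger than 1 that survived the fill
WITH filled AS (
    SELECT * FROM TS_FILL_GAPS('sales_data', 'date', 'sales',
        {'method': 'linear', 'max_gap': 7, 'frequency': 'day'})
)
SELECT COUNT(*) AS remaining_gaps
FROM (
    SELECT date - LAG(date) OVER (ORDER BY date) AS day_step
    FROM filled
)
WHERE day_step > 1;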
TS_REMOVE_OUTLIERS
Detect and handle outliers
SELECT * FROM TS_REMOVE_OUTLIERS(
'sales_data',
'sales',
{
'method': 'iqr', -- iqr, zscore, mad, isolation_forest
'multiplier': 3.0, -- Sensitivity (higher = less aggressive)
'action': 'flag' -- flag, remove, or replace
}
);
Methods:
- 'iqr': Interquartile range (robust)
- 'zscore': Z-score (assumes a normal distribution)
- 'mad': Median Absolute Deviation (very robust)
- 'isolation_forest': ML-based (isolates anomalies)
Actions:
- 'flag': Mark outliers in a column
- 'remove': Drop rows
- 'replace': Replace with interpolated value
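It is safest to run with 'flag' first and inspect the counts before choosing 'remove' or 'replace'. A sketch; the flag column name is_outlier is an assumption, so check your installation's actual output schema:
-- How many points would be affected? (is_outlier is an assumed column name)
SELECT
    COUNT(CASE WHEN is_outlier THEN 1 END) AS flagged,
    COUNT(*) AS total
FROM TS_REMOVE_OUTLIERS('sales_data', 'sales',
    {'method': 'iqr', 'multiplier': 3.0, 'action': 'flag'});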
TS_HANDLE_MISSING
Comprehensive missing value handling
SELECT * FROM TS_HANDLE_MISSING(
'sales_data',
'date',
'sales',
{
'strategy': 'interpolate', -- drop, forward_fill, mean, interpolate
'max_consecutive': 3 -- Max consecutive to handle
}
);
Transformations
TS_NORMALIZE
Scale data to 0-1 range
SELECT
date,
sales,
TS_NORMALIZE(sales) OVER () as sales_normalized
FROM sales_data;
Use for:
- ML models requiring normalized input
- Comparing different scales
- Reducing influence of large values
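For reference, the same min-max scaling can be written with ordinary window aggregates; NULLIF guards against a constant series:
-- Plain-SQL equivalent of min-max scaling to the 0-1 range
SELECT
    date,
    sales,
    (sales - MIN(sales) OVER ())
        / NULLIF(MAX(sales) OVER () - MIN(sales) OVER (), 0) AS sales_normalized
FROM sales_data;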
TS_SCALE
Standardize to mean=0, std=1
SELECT
date,
sales,
TS_SCALE(sales) OVER () as sales_scaled
FROM sales_data;
Use for:
- Z-score normalization
- ML model preparation
- Comparing different series
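Likewise, standardization is just the z-score formula (x - mean) / std, which window aggregates express directly:
-- Plain-SQL equivalent of z-score standardization
SELECT
    date,
    sales,
    (sales - AVG(sales) OVER ()) / NULLIF(STDDEV(sales) OVER (), 0) AS sales_scaled
FROM sales_data;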
TS_DESEASONALIZE
Remove seasonal component
SELECT * FROM TS_DESEASONALIZE(
'sales_data',
'date',
'sales',
{'seasonal_period': 7}
);
Returns: Original - Seasonal component
Use for:
- Isolating the pure trend
- Analyzing non-seasonal variation
Aggregation & Resampling
TS_AGGREGATE
Aggregate to a coarser frequency
SELECT * FROM TS_AGGREGATE(
'daily_data',
'date',
'sales',
'week', -- day, week, month, quarter, year
'sum' -- sum, mean, min, max, last
);
Use for:
- Reduce sparsity
- Simplify patterns
- Lower-frequency analysis
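For comparison, the same weekly sum in standard SQL:
-- Plain-SQL weekly aggregation
SELECT
    DATE_TRUNC('week', date) AS week,
    SUM(sales) AS sales
FROM daily_data
GROUP BY DATE_TRUNC('week', date)
ORDER BY week;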
TS_RESAMPLE
Change time series frequency
-- Daily to weekly
SELECT * FROM TS_RESAMPLE(
'daily_sales',
'date',
'sales',
'week'
);
Frequencies:
- 'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'
Statistical Utilities
TS_AUTOCORR
Autocorrelation at a specific lag
SELECT
TS_AUTOCORR(sales, 1) as acf_lag1,
TS_AUTOCORR(sales, 7) as acf_lag7,
TS_AUTOCORR(sales, 14) as acf_lag14
FROM sales_data;
Use for:
- Detect seasonality
- Find data dependencies
- Determine whether differencing is needed
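The lag-1 value also answers the differencing question directly; a rule of thumb (the 0.9 cutoff is illustrative):
-- Lag-1 ACF near 1 indicates a strong trend and suggests differencing
SELECT TS_AUTOCORR(sales, 1) > 0.9 AS needs_differencing
FROM sales_data;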
Cross-Validation
-- Time series cross-validation
CREATE TABLE cv_results AS
WITH splits AS (
SELECT
date,
ROW_NUMBER() OVER (ORDER BY date) as row_num
FROM sales_data
)
SELECT
date,
CASE
WHEN row_num <= 250 THEN 'train'
WHEN row_num <= 280 THEN 'test'
ELSE 'holdout'
END as split
FROM splits;
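A single split gives only one performance estimate; a rolling-origin scheme evaluates several expanding train windows instead. A sketch in plain SQL (the 200-row base window and 30-row test blocks are illustrative):
-- Three expanding-window folds: fold k trains on the first 200 + 30*(k-1) rows
-- and tests on the following 30 rows
WITH numbered AS (
    SELECT date, sales, ROW_NUMBER() OVER (ORDER BY date) AS row_num
    FROM sales_data
)
SELECT
    f.fold,
    n.date,
    CASE WHEN n.row_num <= 200 + 30 * (f.fold - 1) THEN 'train' ELSE 'test' END AS split
FROM numbered n
CROSS JOIN (VALUES (1), (2), (3)) AS f(fold)
WHERE n.row_num <= 200 + 30 * f.fold;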
Data Profiling
-- Complete data profile
CREATE TABLE data_profile AS
SELECT
'Rows' as metric,
COUNT(*)::VARCHAR as value
FROM sales_data
UNION ALL
SELECT 'Date Range', MIN(date)::VARCHAR || ' to ' || MAX(date)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Nulls', COUNT(CASE WHEN sales IS NULL THEN 1 END)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Duplicates', (COUNT(*) - COUNT(DISTINCT date))::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Min Value', ROUND(MIN(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Max Value', ROUND(MAX(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Mean', ROUND(AVG(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'StdDev', ROUND(STDDEV(sales), 2)::VARCHAR
FROM sales_data;
SELECT * FROM data_profile;
Data Pipeline Example
-- Complete data preparation pipeline
-- Step 1: Quality check
CREATE TABLE step1_validated AS
SELECT * FROM sales_raw
WHERE TS_VALIDATE_DATA('sales_raw', 'date', 'sales', {}) = TRUE;
-- Step 2: Fill gaps
CREATE TABLE step2_filled AS
SELECT * FROM TS_FILL_GAPS(
'step1_validated',
'date',
'sales',
{'method': 'linear', 'max_gap': 7}
);
-- Step 3: Remove outliers
CREATE TABLE step3_cleaned AS
SELECT * FROM TS_REMOVE_OUTLIERS(
'step2_filled',
'sales',
{'method': 'mad', 'multiplier': 3.0, 'action': 'replace'}
);
-- Step 4: Normalize
CREATE TABLE step4_normalized AS
SELECT
date,
TS_NORMALIZE(sales) OVER () as sales
FROM step3_cleaned;
-- Step 5: Ready for modeling
SELECT * FROM step4_normalized;
Common Utility Patterns
Verify data before forecasting
-- Pre-forecast checklist
WITH checks AS (
SELECT
'Observations >= 60' as check,
CASE WHEN COUNT(*) >= 60 THEN 'PASS' ELSE 'FAIL' END as result
FROM sales_data
UNION ALL
SELECT
'No nulls',
CASE WHEN COUNT(CASE WHEN sales IS NULL THEN 1 END) = 0 THEN 'PASS' ELSE 'FAIL' END
FROM sales_data
UNION ALL
SELECT
'No duplicates',
CASE WHEN COUNT(DISTINCT date) = COUNT(*) THEN 'PASS' ELSE 'FAIL' END
FROM sales_data
)
SELECT * FROM checks;
Monitor data quality over time
-- Weekly data quality monitoring
CREATE VIEW data_quality_monitor AS
SELECT
DATE_TRUNC('week', date) as week,
COUNT(*) as new_records,
COUNT(CASE WHEN sales IS NULL THEN 1 END) as nulls,
ROUND(AVG(sales), 2) as mean_sales,
ROUND(STDDEV(sales), 2) as std_sales
FROM sales_data
GROUP BY DATE_TRUNC('week', date);
Performance Tips
For large datasets:
- Aggregate to lower frequency
- Pre-process before storing
- Use indexes on date columns (see the sketch after this list)
- Parallelize with GROUP BY
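A minimal sketch of the indexing and pre-aggregation tips (CREATE INDEX syntax and its payoff vary by engine; some columnar stores rely on zone maps instead):
-- Index the date column to speed up range scans
CREATE INDEX idx_sales_date ON sales_data (date);
-- Pre-aggregate once, then analyze the smaller summary table
CREATE TABLE sales_weekly AS
SELECT DATE_TRUNC('week', date) AS week, SUM(sales) AS sales
FROM sales_data
GROUP BY DATE_TRUNC('week', date);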
For production:
- Automate validation (see the sketch after this list)
- Log errors and warnings
- Monitor data quality metrics
- Alert on issues
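One way to automate the checklist pattern from above is a view that surfaces only failures, which a scheduler or alerting tool can poll (that wiring is stack-specific):
-- Empty result = healthy; any row = something to investigate
CREATE VIEW failed_checks AS
WITH checks AS (
    SELECT
        'No nulls' as check,
        CASE WHEN COUNT(CASE WHEN sales IS NULL THEN 1 END) = 0
            THEN 'PASS' ELSE 'FAIL' END as result
    FROM sales_data
    UNION ALL
    SELECT
        'No duplicates',
        CASE WHEN COUNT(DISTINCT date) = COUNT(*) THEN 'PASS' ELSE 'FAIL' END
    FROM sales_data
)
SELECT * FROM checks WHERE result = 'FAIL';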
Next Steps
- Data Preparation Guide — Practical cleaning examples
- Production Deployment — Automated monitoring
- Diagnostics Reference — Pattern detection
Key Takeaways
- ✅ Always validate data before forecasting
- ✅ Use TS_QUALITY_REPORT to understand data
- ✅ TS_FILL_GAPS for missing dates
- ✅ TS_REMOVE_OUTLIERS for anomalies
- ✅ Automate validation in production
- ✅ Monitor data quality weekly
- ✅ Document cleaning decisions