
Utilities & Data Preparation Functions

Helper functions for data validation, cleaning, and preparation.

Data Quality & Validation

TS_QUALITY_REPORT

Generate data quality assessment

SELECT * FROM TS_QUALITY_REPORT(
    'sales_data',
    'date',
    'sales'
);

Returns:

metric             | value
-------------------|-------
total_observations | 365
null_count         | 0
duplicate_count    | 0
min_value          | 50.2
max_value          | 250.8
mean_value         | 150.5
std_dev            | 45.3
data_quality_score | 0.95

Use for:

  • Initial data exploration
  • Quality checks
  • Reporting
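
The quality score row can gate a pipeline. A minimal sketch, assuming the report's value column casts to DOUBLE (column types may differ in your build):

-- Proceed only when the overall quality score is high enough
SELECT value::DOUBLE AS quality_score
FROM TS_QUALITY_REPORT('sales_data', 'date', 'sales')
WHERE metric = 'data_quality_score'
  AND value::DOUBLE >= 0.9;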

TS_VALIDATE_DATA

Validate that the data meets requirements

SELECT TS_VALIDATE_DATA(
    'sales_data',
    'date',
    'sales',
    {
        'min_observations': 60,
        'require_no_nulls': true,
        'check_duplicates': true
    }
);

Validates:

  • Minimum observations
  • No missing values
  • No duplicate dates
  • Date ordering
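
Since the function returns a single boolean (see the pipeline example later in this page), it can drive a simple pass/fail status. A sketch:

-- Turn the validation result into a readable status
SELECT CASE
           WHEN TS_VALIDATE_DATA('sales_data', 'date', 'sales',
                                 {'min_observations': 60,
                                  'require_no_nulls': true,
                                  'check_duplicates': true})
           THEN 'ready for forecasting'
           ELSE 'needs cleaning'
       END AS status;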

TS_DETECT_DUPLICATES

Find duplicate dates

SELECT date, COUNT(*) AS cnt
FROM (
    SELECT date FROM TS_DETECT_DUPLICATES('sales_data', 'date')
)
GROUP BY date
HAVING cnt > 1;

Returns: date values that appear more than once, with their counts


Data Cleaning & Gap Filling

TS_FILL_GAPS

Fill missing dates and interpolate values

SELECT * FROM TS_FILL_GAPS(
    'sales_data',
    'date',
    'sales',
    {
        'method': 'linear',  -- linear, forward_fill, backward_fill, mean
        'max_gap': 7,        -- max gap to fill (in days)
        'frequency': 'day'   -- day, week, month, etc.
    }
);

Methods:

  • 'linear': Interpolate between values
  • 'forward_fill': Repeat last value
  • 'backward_fill': Use next value
  • 'mean': Use average value

Use for:

  • Complete time series
  • Regular intervals
  • Missing dates
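
To verify how much was filled, compare row counts before and after. A sketch using the same options as above:

-- Rows added = dates that were missing and got filled
SELECT
    (SELECT COUNT(*) FROM sales_data) AS original_rows,
    (SELECT COUNT(*)
     FROM TS_FILL_GAPS('sales_data', 'date', 'sales',
                       {'method': 'linear', 'max_gap': 7, 'frequency': 'day'})) AS filled_rows;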

TS_REMOVE_OUTLIERS

Detect and handle outliers

SELECT * FROM TS_REMOVE_OUTLIERS(
    'sales_data',
    'sales',
    {
        'method': 'iqr',    -- iqr, zscore, mad, isolation_forest
        'multiplier': 3.0,  -- sensitivity (higher = less aggressive)
        'action': 'flag'    -- flag, remove, or replace
    }
);

Methods:

  • 'iqr': Interquartile range (robust)
  • 'zscore': Z-score (assumes normal)
  • 'mad': Median Absolute Deviation (very robust)
  • 'isolation_forest': ML-based (isolates anomalous points)

Actions:

  • 'flag': Mark outliers in column
  • 'remove': Drop rows
  • 'replace': Replace with interpolated value
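
With 'flag', you can inspect outliers before deciding to drop or replace them. A sketch, assuming the flag action adds a boolean column (called is_outlier here; the actual column name may differ):

-- Count flagged points without modifying the data
SELECT COUNT(*) AS flagged_points
FROM TS_REMOVE_OUTLIERS('sales_data', 'sales',
    {'method': 'iqr', 'multiplier': 3.0, 'action': 'flag'})
WHERE is_outlier;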

TS_HANDLE_MISSING

Comprehensive missing value handling

SELECT * FROM TS_HANDLE_MISSING(
    'sales_data',
    'date',
    'sales',
    {
        'strategy': 'interpolate',  -- drop, forward_fill, mean, interpolate
        'max_consecutive': 3        -- max consecutive missing values to handle
    }
);

Transformations

TS_NORMALIZE

Scale data to the 0-1 range

SELECT
    date,
    sales,
    TS_NORMALIZE(sales) OVER () AS sales_normalized
FROM sales_data;

Use for:

  • ML models requiring normalized input
  • Comparing different scales
  • Reducing influence of large values
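
For reference, min-max normalization is the standard (x - min) / (max - min) rescaling. An equivalent plain-SQL sketch, with a guard against constant series:

-- Manual min-max scaling; should match TS_NORMALIZE on non-constant data
SELECT
    date,
    sales,
    (sales - MIN(sales) OVER ()) /
    NULLIF(MAX(sales) OVER () - MIN(sales) OVER (), 0) AS sales_normalized
FROM sales_data;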

TS_SCALE

Standardize to mean=0, std=1

SELECT
    date,
    sales,
    TS_SCALE(sales) OVER () AS sales_scaled
FROM sales_data;

Use for:

  • Z-score normalization
  • ML model preparation
  • Comparing different series
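
Standardization is the usual z-score transform (x - mean) / std. An equivalent plain-SQL sketch:

-- Manual z-score; should match TS_SCALE on non-constant data
SELECT
    date,
    sales,
    (sales - AVG(sales) OVER ()) /
    NULLIF(STDDEV(sales) OVER (), 0) AS sales_scaled
FROM sales_data;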

TS_DESEASONALIZE

Remove seasonal component

SELECT * FROM TS_DESEASONALIZE(
    'sales_data',
    'date',
    'sales',
    {'seasonal_period': 7}
);

Returns: Original - Seasonal component

Use for:

  • Isolate the pure trend
  • Analyze non-seasonal variation
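
Because the output is the original minus the seasonal component, the seasonal component itself can be recovered by subtraction. A sketch, assuming the function returns date and sales columns (actual output column names may differ):

-- seasonal = original - deseasonalized
SELECT o.date, o.sales - d.sales AS seasonal_component
FROM sales_data o
JOIN TS_DESEASONALIZE('sales_data', 'date', 'sales', {'seasonal_period': 7}) d
  ON o.date = d.date;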

Aggregation & Resampling

TS_AGGREGATE

Aggregate to coarser frequency

SELECT * FROM TS_AGGREGATE(
    'daily_data',
    'date',
    'sales',
    'week',  -- day, week, month, quarter, year
    'sum'    -- sum, mean, min, max, last
);

Use for:

  • Reduce sparsity
  • Simplify patterns
  • Lower frequency analysis
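
For intuition, a ('week', 'sum') aggregation should correspond to a plain DATE_TRUNC rollup. A reference sketch:

-- Weekly sums in plain SQL, for comparison with TS_AGGREGATE
SELECT DATE_TRUNC('week', date) AS week, SUM(sales) AS weekly_sales
FROM daily_data
GROUP BY 1
ORDER BY 1;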

TS_RESAMPLE

Change time series frequency

-- Daily to weekly
SELECT * FROM TS_RESAMPLE(
    'daily_sales',
    'date',
    'sales',
    'week'
);

Frequencies:

  • 'minute', 'hour', 'day', 'week', 'month', 'quarter', 'year'

Statistical Utilities

TS_AUTOCORR

Autocorrelation at specific lag

SELECT
    TS_AUTOCORR(sales, 1) AS acf_lag1,
    TS_AUTOCORR(sales, 7) AS acf_lag7,
    TS_AUTOCORR(sales, 14) AS acf_lag14
FROM sales_data;

Use for:

  • Detect seasonality
  • Identify lagged dependencies
  • Decide whether differencing is needed
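
A quick way to pick a seasonal period is to compare candidate lags and take the strongest. A sketch over the lags used above:

-- Rank candidate lags by absolute autocorrelation
SELECT lag, acf
FROM (
    SELECT 1 AS lag, TS_AUTOCORR(sales, 1) AS acf FROM sales_data
    UNION ALL
    SELECT 7, TS_AUTOCORR(sales, 7) FROM sales_data
    UNION ALL
    SELECT 14, TS_AUTOCORR(sales, 14) FROM sales_data
)
ORDER BY ABS(acf) DESC;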

Cross-Validation

-- Time series cross-validation
CREATE TABLE cv_results AS
WITH splits AS (
    SELECT
        DATE_TRUNC('month', date) AS month,
        ROW_NUMBER() OVER (ORDER BY date) AS row_num
    FROM sales_data
)
SELECT
    month,
    CASE
        WHEN row_num <= 250 THEN 'train'
        WHEN row_num <= 280 THEN 'test'
        ELSE 'holdout'
    END AS split
FROM splits;
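
A quick sanity check on the resulting split sizes:

-- Verify the train/test/holdout proportions
SELECT split, COUNT(*) AS n_rows
FROM cv_results
GROUP BY split;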

Data Profiling

-- Complete data profile
CREATE TABLE data_profile AS
SELECT 'Rows' AS metric, COUNT(*)::VARCHAR AS value
FROM sales_data
UNION ALL
SELECT 'Date Range', MIN(date)::VARCHAR || ' to ' || MAX(date)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Nulls', COUNT(CASE WHEN sales IS NULL THEN 1 END)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Duplicates', (COUNT(*) - COUNT(DISTINCT date))::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Min Value', ROUND(MIN(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Max Value', ROUND(MAX(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'Mean', ROUND(AVG(sales), 2)::VARCHAR
FROM sales_data
UNION ALL
SELECT 'StdDev', ROUND(STDDEV(sales), 2)::VARCHAR
FROM sales_data;

SELECT * FROM data_profile;

Data Pipeline Example

-- Complete data preparation pipeline

-- Step 1: Quality check
CREATE TABLE step1_validated AS
SELECT * FROM sales_raw
WHERE TS_VALIDATE_DATA('sales_raw', 'date', 'sales', {}) = TRUE;

-- Step 2: Fill gaps
CREATE TABLE step2_filled AS
SELECT * FROM TS_FILL_GAPS(
    'step1_validated',
    'date',
    'sales',
    {'method': 'linear', 'max_gap': 7}
);

-- Step 3: Remove outliers
CREATE TABLE step3_cleaned AS
SELECT * FROM TS_REMOVE_OUTLIERS(
    'step2_filled',
    'sales',
    {'method': 'mad', 'multiplier': 3.0, 'action': 'replace'}
);

-- Step 4: Normalize
CREATE TABLE step4_normalized AS
SELECT
    date,
    TS_NORMALIZE(sales) OVER () AS sales
FROM step3_cleaned;

-- Step 5: Ready for modeling
SELECT * FROM step4_normalized;

Common Utility Patterns

Verify data before forecasting

-- Pre-forecast checklist
WITH checks AS (
    SELECT
        'Observations >= 60' AS "check",
        CASE WHEN COUNT(*) >= 60 THEN 'PASS' ELSE 'FAIL' END AS result
    FROM sales_data
    UNION ALL
    SELECT
        'No nulls',
        CASE WHEN COUNT(CASE WHEN sales IS NULL THEN 1 END) = 0 THEN 'PASS' ELSE 'FAIL' END
    FROM sales_data
    UNION ALL
    SELECT
        'No duplicates',
        CASE WHEN COUNT(DISTINCT date) = COUNT(*) THEN 'PASS' ELSE 'FAIL' END
    FROM sales_data
)
SELECT * FROM checks;

Monitor data quality over time

-- Weekly data quality monitoring
CREATE VIEW data_quality_monitor AS
SELECT
    DATE_TRUNC('week', date) AS week,
    COUNT(*) AS new_records,
    COUNT(CASE WHEN sales IS NULL THEN 1 END) AS nulls,
    ROUND(AVG(sales), 2) AS mean_sales,
    ROUND(STDDEV(sales), 2) AS std_sales
FROM sales_data
WHERE date >= CURRENT_DATE - INTERVAL '1 week'
GROUP BY DATE_TRUNC('week', date);

Performance Tips

For large datasets:

  • Aggregate to lower frequency
  • Pre-process before storing
  • Use indexes on date columns (see the sketch below)
  • Parallelize with GROUP BY
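
A minimal index sketch (standard SQL; index support and its performance impact vary by engine):

-- An index on the date column speeds up range filters and ordered scans
CREATE INDEX idx_sales_date ON sales_data (date);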

For production:

  • Automate validation
  • Log errors and warnings
  • Monitor data quality metrics
  • Alert on issues

Key Takeaways

  • ✅ Always validate data before forecasting
  • ✅ Use TS_QUALITY_REPORT to understand data
  • ✅ Use TS_FILL_GAPS for missing dates
  • ✅ Use TS_REMOVE_OUTLIERS for anomalies
  • ✅ Automate validation in production
  • ✅ Monitor data quality weekly
  • ✅ Document cleaning decisions