Basic Workflow
Complete end-to-end data validation pipeline from raw data to validation gates.
The Complete Pipeline
Raw Data → Profile → Validate → Separate → Report → Archive
Step 1: Load Raw Data
Create a table with customer data (potentially dirty):
CREATE TABLE raw_customers AS
SELECT
1 as customer_id, 'john.doe@example.com' as email, '555-0123' as phone, 'DE123456789' as vat_id, 1500.50 as order_amount, now() as created_at
UNION ALL
SELECT 2, 'bad.email@', '(555) 0123', 'INVALID_VAT', -200.00, now() - interval '60 days'
UNION ALL
SELECT 3, 'jane@company.org', '+1-555-0124', 'FR12345678', 0.00, now() - interval '90 days'
UNION ALL
SELECT 4, 'bob.smith@domain.co.uk', '+44-20-7946-0958', 'GB123456789', 5000.00, now()
UNION ALL
SELECT 5, 'alice@test', '+1 555 0125', 'ES12345678A', 250.75, now() - interval '30 days';
Step 2: Profile the Data
Understand what you're working with:
SELECT
COUNT(*) as total_records,
COUNT(*) FILTER (WHERE email IS NOT NULL) as email_count,
COUNT(*) FILTER (WHERE phone IS NOT NULL) as phone_count,
COUNT(*) FILTER (WHERE vat_id IS NOT NULL) as vat_count,
COUNT(DISTINCT customer_id) as unique_customers,
MIN(created_at) as oldest_record,
MAX(created_at) as newest_record,
ROUND(100.0 * COUNT(*) FILTER (WHERE created_at > now() - interval '30 days') / COUNT(*), 1) as created_last_30days_pct
FROM raw_customers;
Output:
total_records: 5
email_count: 5
phone_count: 5
vat_count: 5
unique_customers: 5
oldest_record: 90 days ago
newest_record: now
created_last_30days_pct: 60.0
Step 3: Validate Each Field
Email Validation
CREATE TABLE validated_emails AS
SELECT
customer_id,
email,
anofox_email_validate(email, 'regex') as email_regex_valid,
anofox_email_validate(email, 'dns') as email_dns_valid,
CASE
WHEN anofox_email_validate(email, 'dns') IS TRUE THEN 'PASS'
ELSE 'FAIL'
END as email_validation_status
FROM raw_customers;
Phone Validation
CREATE TABLE validated_phones AS
SELECT
customer_id,
phone,
anofox_phone_format(phone, 'US') as phone_formatted,
CASE
WHEN anofox_phone_format(phone, 'US') IS NOT NULL THEN 'PASS'
ELSE 'FAIL'
END as phone_validation_status
FROM raw_customers;
VAT Validation
CREATE TABLE validated_vat AS
SELECT
customer_id,
vat_id,
anofox_vat_is_valid(vat_id) as vat_valid,
anofox_vat_get_country(vat_id) as vat_country,
CASE
WHEN anofox_vat_is_valid(vat_id) IS TRUE THEN 'PASS'
ELSE 'FAIL'
END as vat_validation_status
FROM raw_customers;
Amount Validation
CREATE TABLE validated_amounts AS
SELECT
customer_id,
order_amount,
anofox_money_is_positive(order_amount) as amount_positive,
anofox_money_is_valid_amount(order_amount, 0.01, 999999.99) as amount_in_range,
CASE
WHEN anofox_money_is_positive(order_amount) IS TRUE
AND anofox_money_is_valid_amount(order_amount, 0.01, 999999.99) IS TRUE
THEN 'PASS'
ELSE 'FAIL'
END as amount_validation_status
FROM raw_customers;
Step 4: Combine All Validations
CREATE TABLE all_validations AS
SELECT
raw.customer_id,
raw.email,
raw.phone,
raw.vat_id,
raw.order_amount,
email_validation_status,
phone_validation_status,
vat_validation_status,
amount_validation_status,
CASE
WHEN email_validation_status = 'PASS'
AND phone_validation_status = 'PASS'
AND vat_validation_status = 'PASS'
AND amount_validation_status = 'PASS'
THEN 'ALL_PASS'
ELSE 'SOME_FAIL'
END as overall_status
FROM raw_customers raw
LEFT JOIN validated_emails USING (customer_id)
LEFT JOIN validated_phones USING (customer_id)
LEFT JOIN validated_vat USING (customer_id)
LEFT JOIN validated_amounts USING (customer_id);
Step 5: Separate Valid from Invalid
-- Valid records: Pass all validation gates
CREATE TABLE valid_customers AS
SELECT
customer_id,
email,
phone,
vat_id,
order_amount
FROM all_validations
WHERE overall_status = 'ALL_PASS';
-- Invalid records: Failed at least one gate (for review/correction)
CREATE TABLE invalid_customers AS
SELECT
customer_id,
email,
phone,
vat_id,
order_amount,
email_validation_status,
phone_validation_status,
vat_validation_status,
amount_validation_status
FROM all_validations
WHERE overall_status = 'SOME_FAIL';
Step 6: Generate Validation Report
SELECT
'Email' as validation_type,
COUNT(*) FILTER (WHERE email_validation_status = 'PASS') as passed,
COUNT(*) FILTER (WHERE email_validation_status = 'FAIL') as failed,
ROUND(100.0 * COUNT(*) FILTER (WHERE email_validation_status = 'PASS') / COUNT(*), 1) as pass_rate_pct
FROM all_validations
UNION ALL
SELECT
'Phone' as validation_type,
COUNT(*) FILTER (WHERE phone_validation_status = 'PASS') as passed,
COUNT(*) FILTER (WHERE phone_validation_status = 'FAIL') as failed,
ROUND(100.0 * COUNT(*) FILTER (WHERE phone_validation_status = 'PASS') / COUNT(*), 1) as pass_rate_pct
FROM all_validations
UNION ALL
SELECT
'VAT' as validation_type,
COUNT(*) FILTER (WHERE vat_validation_status = 'PASS') as passed,
COUNT(*) FILTER (WHERE vat_validation_status = 'FAIL') as failed,
ROUND(100.0 * COUNT(*) FILTER (WHERE vat_validation_status = 'PASS') / COUNT(*), 1) as pass_rate_pct
FROM all_validations
UNION ALL
SELECT
'Amount' as validation_type,
COUNT(*) FILTER (WHERE amount_validation_status = 'PASS') as passed,
COUNT(*) FILTER (WHERE amount_validation_status = 'FAIL') as failed,
ROUND(100.0 * COUNT(*) FILTER (WHERE amount_validation_status = 'PASS') / COUNT(*), 1) as pass_rate_pct
FROM all_validations
UNION ALL
SELECT
'OVERALL' as validation_type,
COUNT(*) FILTER (WHERE overall_status = 'ALL_PASS') as passed,
COUNT(*) FILTER (WHERE overall_status = 'SOME_FAIL') as failed,
ROUND(100.0 * COUNT(*) FILTER (WHERE overall_status = 'ALL_PASS') / COUNT(*), 1) as pass_rate_pct
FROM all_validations;
Output:
validation_type | passed | failed | pass_rate_pct
Email | 3 | 2 | 60.0
Phone | 3 | 2 | 60.0
VAT | 3 | 2 | 60.0
Amount | 4 | 1 | 80.0
OVERALL | 3 | 2 | 60.0
Step 7: Archive Invalid Records
Store invalid records for manual review and correction:
CREATE TABLE validation_failures AS
SELECT
customer_id,
'EMAIL' as failure_reason
FROM all_validations
WHERE email_validation_status = 'FAIL'
UNION ALL
SELECT
customer_id,
'PHONE' as failure_reason
FROM all_validations
WHERE phone_validation_status = 'FAIL'
UNION ALL
SELECT
customer_id,
'VAT' as failure_reason
FROM all_validations
WHERE vat_validation_status = 'FAIL'
UNION ALL
SELECT
customer_id,
'AMOUNT' as failure_reason
FROM all_validations
WHERE amount_validation_status = 'FAIL';
Key Takeaways
✅ Always profile before validating (understand your data)
✅ Validate each field with appropriate methods
✅ Combine validations to create gates
✅ Separate valid/invalid for different processing paths
✅ Report on validation rates
✅ Archive failures for review
Next Steps
- Email Validation — Deep dive into email verification
- Financial Compliance — VAT and multi-currency rules
- Production Deployment — Scale this pipeline to production