# Spaces: Runtime error - File size: 7,032 Bytes - f92be26
"""
Master Dataset Seeding Script
This script populates Qdrant with:
1. SEC Forms (10-K) - Corporate compliance benchmark
2. Regulatory Data (GDPR, DPDP Act, LexGLUE) - Legal reference
3. Prepares Legal Case Data - Risk scorer training
Usage:
cd backend && python scripts/seed_datasets.py
Or with options:
python scripts/seed_datasets.py --sec-only
python scripts/seed_datasets.py --regulatory-only
python scripts/seed_datasets.py --prepare-ml-data
python scripts/seed_datasets.py --all
"""
import sys
import os
import argparse
import asyncio
import logging
from typing import Dict
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from services.dataset_loader import (
SECDatasetSeeder,
LegalCaseDatasetPreparer,
RegulatoryDatasetSeeder,
)
from services.qdrant_service import ensure_collection_exists
# Layout of each log line: timestamp, logger name, level, then the message.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# Configure the root logger at INFO level using the layout above.
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
async def seed_all_datasets(
    seed_sec: bool = True,
    seed_regulatory: bool = True,
    prepare_ml: bool = True,
    sec_max_companies: int = 100,
) -> Dict[str, int]:
    """
    Seed the selected datasets into Qdrant and report what was loaded.

    Progress and a final summary are printed to stdout. A failure in one
    stage is logged (with traceback) and does not prevent later stages
    from running.

    Args:
        seed_sec: Seed the SEC Form 10-K dataset.
        seed_regulatory: Run the regulatory seeders (base regulatory data,
            LexGLUE, GDPR cases, custom training data), in that order.
        prepare_ml: Prepare the legal-case training file for the risk scorer.
        sec_max_companies: Forwarded to the SEC seeder as ``max_companies``.

    Returns:
        Dict with counts of what was seeded, keyed by ``"sec_chunks"``,
        ``"regulatory_chunks"`` and ``"ml_training_samples"``. A key is
        present only when its stage was requested. An empty dict is
        returned when the Qdrant collection could not be ensured.
    """
    results: Dict[str, int] = {}
    rule = "=" * 80

    print("\n" + rule)
    print("CODEWIZARDS AI COMPLIANCE SYSTEM - DATASET SEEDING")
    print(rule)

    # Ensure Qdrant collection exists; nothing can be seeded without it.
    print("\n[1/4] Ensuring Qdrant collection...")
    try:
        ensure_collection_exists()
        print("✅ Qdrant collection ready\n")
    except Exception as exc:
        print(f"❌ Error ensuring Qdrant collection: {exc}")
        return results

    # Seed SEC Dataset
    if seed_sec:
        print("[2/4] Seeding SEC Form 10-K Dataset...")
        print(" (Corporate compliance benchmark - Hybrid Storage Active)")
        try:
            results["sec_chunks"] = await SECDatasetSeeder.seed_sec_data(
                max_companies=sec_max_companies,
                max_documents_per_company=1,
                summarize=True,
            )
        except Exception as exc:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("Error seeding SEC data: %s", exc)
            results["sec_chunks"] = 0
    else:
        print("[2/4] ⊘ Skipping SEC Dataset")

    # Seed Regulatory Dataset
    if seed_regulatory:
        print("\n[3/4] Seeding Regulatory Data...")
        print(" (GDPR, DPDP Act, LexGLUE benchmark)")
        # Accumulate step by step so that chunks written by earlier seeders
        # are still counted when a later seeder raises. As before, the first
        # failure skips the remaining regulatory seeders.
        regulatory_total = 0
        try:
            regulatory_total += RegulatoryDatasetSeeder.seed_regulatory_data()
            regulatory_total += RegulatoryDatasetSeeder.seed_from_lexglue()
            regulatory_total += await RegulatoryDatasetSeeder.seed_gdpr_cases(
                max_samples=500
            )
            regulatory_total += RegulatoryDatasetSeeder.seed_custom_training_data()
        except Exception as exc:
            logger.exception("Error seeding regulatory data: %s", exc)
        results["regulatory_chunks"] = regulatory_total
    else:
        print("\n[3/4] ⊘ Skipping Regulatory Dataset")

    # Prepare ML Training Data
    if prepare_ml:
        print("\n[4/4] Preparing Legal Case Data for ML Risk Scorer...")
        print(" (Training data for predicting risk levels 0-100)")
        try:
            # NOTE(review): this path is relative to the current working
            # directory; with the documented `cd backend` usage it would
            # presumably resolve to backend/backend/ml/... - confirm how the
            # preparer interprets it before changing.
            results["ml_training_samples"] = (
                LegalCaseDatasetPreparer.prepare_legal_case_data(
                    output_file="backend/ml/legal_training_data.jsonl",
                    max_samples=5000,
                )
            )
        except Exception as exc:
            logger.exception("Error preparing ML data: %s", exc)
            results["ml_training_samples"] = 0
    else:
        print("\n[4/4] ⊘ Skipping ML Data Preparation")

    # Summary
    sec_count = results.get("sec_chunks", 0)
    regulatory_count = results.get("regulatory_chunks", 0)
    ml_count = results.get("ml_training_samples", 0)
    total_qdrant_chunks = sec_count + regulatory_count

    print("\n" + rule)
    print("SEEDING COMPLETE - SUMMARY")
    print(rule)
    print(f"\n✅ SEC Dataset: {sec_count:,} chunks")
    print(f"✅ Regulatory Dataset: {regulatory_count:,} chunks")
    print(f"✅ ML Training Samples: {ml_count:,} samples")
    print(f"\n📊 Total Qdrant Vectors: {total_qdrant_chunks:,}")

    if total_qdrant_chunks > 0:
        print(f"\n🎯 Your Qdrant database now contains {total_qdrant_chunks:,} regulatory/compliance chunks.")
        print(" When you ingest a new policy, the system will:")
        print(" 1. Search Qdrant for similar regulations")
        print(" 2. Compare against SEC benchmark practices")
        print(" 3. Predict risk level using the trained ML model")
        print(" 4. Generate remediation recommendations via Gemini API")

    print("\n" + rule)
    print("NEXT STEPS:")
    print(rule)
    print("1. Ingest your company policies via the FastAPI endpoint")
    print("2. Upload a new regulation and trigger impact analysis")
    print("3. Watch Qdrant find similar docs + ML score the risk")
    print("\nConsult README.md and LLM.md for full API documentation.")
    print(rule + "\n")

    return results
def main(argv=None):
    """CLI entrypoint: parse flags, run the seeding coroutine, then exit.

    Selection flags (``--sec-only``, ``--regulatory-only``,
    ``--prepare-ml-data``) may be combined; each one enables its stage.
    Passing ``--all``, or no selection flag at all, enables every stage.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Exits with status 0 when at least one chunk or training sample was
    produced, otherwise prints a warning and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Seed CodeWizards compliance system with regulatory datasets"
    )
    parser.add_argument(
        "--sec-only",
        action="store_true",
        help="Seed only SEC dataset",
    )
    parser.add_argument(
        "--regulatory-only",
        action="store_true",
        help="Seed only regulatory data",
    )
    parser.add_argument(
        "--prepare-ml-data",
        action="store_true",
        help="Prepare ML training data only",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Seed everything (default if no flag specified)",
    )
    parser.add_argument(
        "--sec-companies",
        type=int,
        default=100,
        help="Max SEC companies to load (default: 100)",
    )
    args = parser.parse_args(argv)

    # Determine what to seed. Each flag enables its own stage, so no flag is
    # silently dropped when several are given on the same command line.
    seed_sec = args.sec_only
    seed_regulatory = args.regulatory_only
    prepare_ml = args.prepare_ml_data

    # An explicit --all, or no selection flag at all, means everything.
    if args.all or not (seed_sec or seed_regulatory or prepare_ml):
        seed_sec = seed_regulatory = prepare_ml = True

    # Run seeding
    results = asyncio.run(seed_all_datasets(
        seed_sec=seed_sec,
        seed_regulatory=seed_regulatory,
        prepare_ml=prepare_ml,
        sec_max_companies=args.sec_companies,
    ))

    # Exit with appropriate code: success only if something was produced.
    produced = sum(
        results.get(key, 0)
        for key in ("sec_chunks", "regulatory_chunks", "ml_training_samples")
    )
    if produced > 0:
        sys.exit(0)

    print("⚠️ No data was seeded. Check logs for errors.")
    sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|