Spaces:
Running
Running
barathvasan-dev
Upgrade: Professional multi-filter NLP-to-SQL engine with support for combined queries, synonyms, and all intent types
17ff25c | # ========================================================= | |
| # ULTRA ADVANCED HYBRID NLP TO SQL ENGINE | |
| # RULE BASED + LLM BASED + SQL SAFETY | |
| # MISTRAL / SQLCODER READY | |
| # ========================================================= | |
| import re | |
| import traceback | |
| import os | |
| from huggingface_hub import InferenceClient | |
| from dotenv import load_dotenv | |
| from sqlalchemy import create_engine, text | |
| # ========================================================= | |
| # ENVIRONMENT SETUP | |
| # ========================================================= | |
| load_dotenv() | |
| HF_TOKEN = os.getenv("HF_TOKEN") | |
| DATABASE_URL = os.getenv("DATABASE_URL") | |
| # Initialize Mistral client | |
| client = None | |
| try: | |
| if HF_TOKEN: | |
| client = InferenceClient( | |
| model="mistralai/Mistral-7B-Instruct-v0.2", | |
| token=HF_TOKEN | |
| ) | |
| print("β Mistral client initialized") | |
| else: | |
| print("β οΈ HF_TOKEN not set - LLM features disabled") | |
| except Exception as e: | |
| print(f"β οΈ Mistral client error: {e}") | |
| client = None | |
| # Initialize database engine | |
| engine = None | |
| try: | |
| if DATABASE_URL: | |
| engine = create_engine(DATABASE_URL) | |
| print("β Database connection initialized") | |
| else: | |
| print("β οΈ DATABASE_URL not set - Database features disabled") | |
| except Exception as e: | |
| print(f"β οΈ Database connection warning: {e}") | |
| engine = None | |
| # ========================================================= | |
| # CONFIG | |
| # ========================================================= | |
| USE_LLM = True | |
| # ========================================================= | |
| # DATABASE KNOWLEDGE | |
| # ========================================================= | |
| SCHEMA = { | |
| "table": "vehicle_logs", | |
| "columns": [ | |
| "timestamp", | |
| "plate", | |
| "state", | |
| "vehicle_type", | |
| "vehicle_conf", | |
| "camera_id", | |
| "location", | |
| "date", | |
| "hour", | |
| "day" | |
| ] | |
| } | |
| VALID_STATES = { | |
| "tn": "TN", | |
| "tamil nadu": "TN", | |
| "ka": "KA", | |
| "karnataka": "KA", | |
| "kl": "KL", | |
| "kerala": "KL", | |
| "ap": "AP", | |
| "andhra": "AP", | |
| "ts": "TS", | |
| "telangana": "TS", | |
| "mh": "MH", | |
| "maharashtra": "MH", | |
| "dl": "DL", | |
| "delhi": "DL", | |
| "gj": "GJ", | |
| "gujarat": "GJ", | |
| "rj": "RJ", | |
| "rajasthan": "RJ", | |
| "up": "UP", | |
| "uttar pradesh": "UP", | |
| "wb": "WB", | |
| "west bengal": "WB", | |
| "hr": "HR", | |
| "haryana": "HR", | |
| "pb": "PB", | |
| "punjab": "PB" | |
| } | |
| KNOWN_LOCATIONS = [ | |
| "adyar", | |
| "guindy", | |
| "velachery", | |
| "besantnagar", | |
| "besant nagar", | |
| "thiruvanmiyur", | |
| "tnagar", | |
| "t nagar", | |
| "mylapore", | |
| "annanagar", | |
| "anna nagar", | |
| "koyambedu", | |
| "nungambakkam", | |
| "kotturpuram" | |
| ] | |
| VEHICLE_TYPES = [ | |
| "suv", | |
| "bus", | |
| "truck", | |
| "bike", | |
| "auto", | |
| "taxi", | |
| "car", | |
| "jeep", | |
| "sedan" | |
| ] | |
| # ========================================================= | |
| # SQL CLEANER | |
| # ========================================================= | |
| def clean_sql(sql): | |
| sql = sql.replace("```sql", "") | |
| sql = sql.replace("```", "") | |
| sql = sql.strip() | |
| if not sql.endswith(";"): | |
| sql += ";" | |
| return sql | |
| # ========================================================= | |
| # SQL VALIDATOR | |
| # ========================================================= | |
| def validate_sql(sql): | |
| blocked = [ | |
| "DROP", | |
| "DELETE", | |
| "UPDATE", | |
| "INSERT", | |
| "ALTER", | |
| "CREATE", | |
| "TRUNCATE", | |
| "JOIN", | |
| "UNION" | |
| ] | |
| upper = sql.upper() | |
| for word in blocked: | |
| if word in upper: | |
| return False | |
| if not upper.startswith("SELECT"): | |
| return False | |
| if "VEHICLE_LOGS" not in upper: | |
| return False | |
| return True | |
| # ========================================================= | |
| # MAIN NLP TO SQL ENGINE | |
| # ========================================================= | |
| def ask_llm(user_query): | |
| q = user_query.lower().strip() | |
| # ===================================================== | |
| # ENTITY EXTRACTION | |
| # ===================================================== | |
| plate_match = re.search( | |
| r'([A-Z]{2}\d{1,2}[A-Z]{1,3}\d{3,4})', | |
| user_query.upper() | |
| ) | |
| date_match = re.search( | |
| r'(\d{4}-\d{2}-\d{2})', | |
| q | |
| ) | |
| # ===================================================== | |
| # INTENT DETECTION | |
| # ===================================================== | |
| intents = { | |
| "tracking": | |
| any(k in q for k in [ | |
| "track", | |
| "history", | |
| "movement", | |
| "travel", | |
| "route", | |
| "visited", | |
| "where" | |
| ]), | |
| "count": | |
| any(k in q for k in [ | |
| "count", | |
| "how many", | |
| "total" | |
| ]), | |
| "analytics": | |
| any(k in q for k in [ | |
| "top", | |
| "most", | |
| "distribution", | |
| "analysis", | |
| "statistics", | |
| "peak" | |
| ]), | |
| "latest": | |
| any(k in q for k in [ | |
| "latest", | |
| "recent", | |
| "last" | |
| ]) | |
| } | |
| # ===================================================== | |
| # RULE BASED ENGINE | |
| # ===================================================== | |
| # ===================================================== | |
| # PLATE TRACKING | |
| # ===================================================== | |
| if plate_match: | |
| plate = plate_match.group(1) | |
| # TRACKING | |
| if intents["tracking"]: | |
| return clean_sql(f""" | |
| SELECT | |
| timestamp, | |
| plate, | |
| state, | |
| vehicle_type, | |
| location, | |
| camera_id, | |
| date, | |
| hour, | |
| day | |
| FROM vehicle_logs | |
| WHERE plate = '{plate}' | |
| ORDER BY timestamp DESC | |
| LIMIT 100 | |
| """) | |
| # COUNT | |
| if intents["count"]: | |
| return clean_sql(f""" | |
| SELECT | |
| plate, | |
| COUNT(*) as detections, | |
| COUNT(DISTINCT location) as unique_locations, | |
| COUNT(DISTINCT date) as active_days | |
| FROM vehicle_logs | |
| WHERE plate = '{plate}' | |
| GROUP BY plate | |
| """) | |
| # DEFAULT | |
| return clean_sql(f""" | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE plate = '{plate}' | |
| ORDER BY timestamp DESC | |
| LIMIT 50 | |
| """) | |
| # ===================================================== | |
| # STATE QUERIES | |
| # ===================================================== | |
| for key, state in VALID_STATES.items(): | |
| if key in q: | |
| if intents["count"]: | |
| return clean_sql(f""" | |
| SELECT | |
| state, | |
| COUNT(*) as total_detections, | |
| COUNT(DISTINCT plate) as unique_vehicles | |
| FROM vehicle_logs | |
| WHERE state = '{state}' | |
| GROUP BY state | |
| """) | |
| return clean_sql(f""" | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE state = '{state}' | |
| ORDER BY timestamp DESC | |
| LIMIT 100 | |
| """) | |
| # ===================================================== | |
| # LOCATION QUERIES | |
| # ===================================================== | |
| for loc in KNOWN_LOCATIONS: | |
| if loc in q: | |
| # COUNT | |
| if intents["count"]: | |
| return clean_sql(f""" | |
| SELECT | |
| location, | |
| COUNT(*) as detections, | |
| COUNT(DISTINCT plate) as unique_vehicles | |
| FROM vehicle_logs | |
| WHERE LOWER(location) LIKE '%{loc}%' | |
| GROUP BY location | |
| ORDER BY detections DESC | |
| """) | |
| # DEFAULT | |
| return clean_sql(f""" | |
| SELECT | |
| timestamp, | |
| plate, | |
| state, | |
| vehicle_type, | |
| location, | |
| camera_id | |
| FROM vehicle_logs | |
| WHERE LOWER(location) LIKE '%{loc}%' | |
| ORDER BY timestamp DESC | |
| LIMIT 100 | |
| """) | |
| # ===================================================== | |
| # VEHICLE TYPE | |
| # ===================================================== | |
| for vtype in VEHICLE_TYPES: | |
| if vtype in q: | |
| if intents["count"]: | |
| return clean_sql(f""" | |
| SELECT | |
| vehicle_type, | |
| COUNT(*) as count | |
| FROM vehicle_logs | |
| WHERE LOWER(vehicle_type) LIKE '%{vtype}%' | |
| GROUP BY vehicle_type | |
| """) | |
| return clean_sql(f""" | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE LOWER(vehicle_type) LIKE '%{vtype}%' | |
| ORDER BY timestamp DESC | |
| LIMIT 50 | |
| """) | |
| # ===================================================== | |
| # DATE QUERY | |
| # ===================================================== | |
| if date_match: | |
| d = date_match.group(1) | |
| return clean_sql(f""" | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE date = '{d}' | |
| ORDER BY timestamp DESC | |
| LIMIT 100 | |
| """) | |
| # ===================================================== | |
| # ANALYTICS | |
| # ===================================================== | |
| if "hourly traffic" in q or "traffic by hour" in q: | |
| return clean_sql(""" | |
| SELECT | |
| hour, | |
| COUNT(*) as traffic | |
| FROM vehicle_logs | |
| GROUP BY hour | |
| ORDER BY hour | |
| """) | |
| if "top vehicles" in q or "most detected" in q: | |
| return clean_sql(""" | |
| SELECT | |
| plate, | |
| COUNT(*) as detections | |
| FROM vehicle_logs | |
| GROUP BY plate | |
| ORDER BY detections DESC | |
| LIMIT 20 | |
| """) | |
| if "state distribution" in q: | |
| return clean_sql(""" | |
| SELECT | |
| state, | |
| COUNT(*) as count | |
| FROM vehicle_logs | |
| GROUP BY state | |
| ORDER BY count DESC | |
| """) | |
| if "vehicle type distribution" in q: | |
| return clean_sql(""" | |
| SELECT | |
| vehicle_type, | |
| COUNT(*) as count | |
| FROM vehicle_logs | |
| GROUP BY vehicle_type | |
| ORDER BY count DESC | |
| """) | |
| if "latest" in q or "recent" in q: | |
| return clean_sql(""" | |
| SELECT * | |
| FROM vehicle_logs | |
| ORDER BY timestamp DESC | |
| LIMIT 50 | |
| """) | |
| # ===================================================== | |
| # LLM FALLBACK | |
| # ===================================================== | |
| if not USE_LLM: | |
| return clean_sql(""" | |
| SELECT * | |
| FROM vehicle_logs | |
| ORDER BY timestamp DESC | |
| LIMIT 10 | |
| """) | |
| # ===================================================== | |
| # SYSTEM PROMPT | |
| # ===================================================== | |
| system_prompt = f""" | |
| You are an elite PostgreSQL SQL generator. | |
| Your job: | |
| Convert natural language into VALID PostgreSQL SQL. | |
| ================================================== | |
| DATABASE | |
| ================================================== | |
| TABLE: | |
| vehicle_logs | |
| AVAILABLE COLUMNS: | |
| timestamp | |
| plate | |
| state | |
| vehicle_type | |
| vehicle_conf | |
| camera_id | |
| location | |
| date | |
| hour | |
| day | |
| ================================================== | |
| COLUMN MEANINGS | |
| ================================================== | |
| timestamp: | |
| vehicle detection timestamp | |
| plate: | |
| vehicle number plate | |
| state: | |
| vehicle state code | |
| vehicle_type: | |
| type of vehicle | |
| vehicle_conf: | |
| AI detection confidence | |
| camera_id: | |
| CCTV camera ID | |
| location: | |
| detected location | |
| date: | |
| YYYY-MM-DD | |
| hour: | |
| 0-23 | |
| day: | |
| Monday-Sunday | |
| ================================================== | |
| KNOWN STATES | |
| ================================================== | |
| TN | |
| KA | |
| KL | |
| AP | |
| TS | |
| MH | |
| DL | |
| GJ | |
| RJ | |
| UP | |
| WB | |
| HR | |
| PB | |
| ================================================== | |
| KNOWN LOCATIONS | |
| ================================================== | |
| {KNOWN_LOCATIONS} | |
| ================================================== | |
| STRICT RULES | |
| ================================================== | |
| 1. ONLY use vehicle_logs | |
| 2. NEVER use JOIN | |
| 3. NEVER invent tables | |
| 4. NEVER invent columns | |
| 5. ONLY SELECT queries | |
| 6. NEVER use UPDATE | |
| 7. NEVER use DELETE | |
| 8. NEVER use DROP | |
| 9. NEVER use ALTER | |
| 10. PostgreSQL syntax only | |
| 11. Always use LIMIT 50 or LIMIT 100 | |
| 12. Return SQL ONLY | |
| 13. No markdown | |
| 14. No explanation | |
| ================================================== | |
| QUERY UNDERSTANDING | |
| ================================================== | |
| track vehicle | |
| β WHERE plate='' | |
| show TN vehicles | |
| β WHERE state='TN' | |
| show vehicles from adyar | |
| β WHERE LOWER(location) LIKE '%adyar%' | |
| top vehicles | |
| β GROUP BY plate | |
| hourly traffic | |
| β GROUP BY hour | |
| vehicle type distribution | |
| β GROUP BY vehicle_type | |
| latest detections | |
| β ORDER BY timestamp DESC | |
| ================================================== | |
| GOOD EXAMPLES | |
| ================================================== | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE state='TN' | |
| ORDER BY timestamp DESC | |
| LIMIT 50; | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE LOWER(location) LIKE '%adyar%' | |
| ORDER BY timestamp DESC | |
| LIMIT 50; | |
| SELECT | |
| plate, | |
| COUNT(*) as detections | |
| FROM vehicle_logs | |
| GROUP BY plate | |
| ORDER BY detections DESC | |
| LIMIT 20; | |
| SELECT * | |
| FROM vehicle_logs | |
| WHERE plate='TN63MB3157' | |
| ORDER BY timestamp DESC | |
| LIMIT 100; | |
| """ | |
| user_prompt = f""" | |
| Generate PostgreSQL SQL query for: | |
| {user_query} | |
| """ | |
| # ===================================================== | |
| # MISTRAL / SQLCODER CALL | |
| # ===================================================== | |
| try: | |
| if client is None: | |
| print("β Mistral client not initialized - HF_TOKEN missing") | |
| raise Exception("LLM service unavailable - HF_TOKEN not configured") | |
| try: | |
| response = client.chat_completion( | |
| messages=[ | |
| { | |
| "role": "system", | |
| "content": system_prompt | |
| }, | |
| { | |
| "role": "user", | |
| "content": user_prompt | |
| } | |
| ], | |
| max_tokens=250, | |
| temperature=0.05 | |
| ) | |
| sql = response.choices[0].message.content.strip() | |
| except Exception as api_error: | |
| print(f"β οΈ API timeout or error: {api_error}") | |
| # Fallback to rule-based query if LLM times out | |
| print("β οΈ Using fallback query due to API timeout") | |
| return clean_sql(""" | |
| SELECT * | |
| FROM vehicle_logs | |
| ORDER BY timestamp DESC | |
| LIMIT 10 | |
| """) | |
| sql = clean_sql(sql) | |
| # ================================================= | |
| # SAFETY | |
| # ================================================= | |
| if not validate_sql(sql): | |
| print("β SQL validation failed - using safe query") | |
| return clean_sql(""" | |
| SELECT * | |
| FROM vehicle_logs | |
| ORDER BY timestamp DESC | |
| LIMIT 10 | |
| """) | |
| # AUTO LIMIT | |
| if "LIMIT" not in sql.upper(): | |
| sql = sql.replace(";", " LIMIT 50;") | |
| return sql | |
| except Exception as e: | |
| print(f"β LLM ERROR: {e}") | |
| traceback.print_exc() | |
| return clean_sql(""" | |
| SELECT * | |
| FROM vehicle_logs | |
| ORDER BY timestamp DESC | |
| LIMIT 10 | |
| """) | |
| # ========================================================= | |
| # QUERY EXECUTION | |
| # ========================================================= | |
| def run_query(user_query): | |
| """Execute NLP-to-SQL query with timeout protection""" | |
| sql = "" | |
| try: | |
| sql = ask_llm(user_query) | |
| print("\n" + "="*40) | |
| print("USER QUERY:") | |
| print(user_query) | |
| print("\nGENERATED SQL:") | |
| print(sql) | |
| print("="*40) | |
| if engine is None: | |
| return { | |
| "query": user_query, | |
| "error": "β Database not configured - DATABASE_URL missing", | |
| "sql": sql, | |
| "result": [], | |
| "count": 0 | |
| } | |
| try: | |
| # Execute with timeout protection | |
| with engine.connect() as conn: | |
| # Set statement timeout to 30 seconds | |
| conn.execute(text("SET statement_timeout = 30000")) # 30 seconds | |
| result = conn.execute(text(sql)) | |
| rows = [ | |
| dict(r._mapping) | |
| for r in result | |
| ] | |
| return { | |
| "query": user_query, | |
| "sql": sql, | |
| "count": len(rows), | |
| "result": rows | |
| } | |
| except Exception as query_error: | |
| print(f"β Query Execution Error (possible timeout): {query_error}") | |
| return { | |
| "query": user_query, | |
| "error": f"Query timeout or error: {str(query_error)}", | |
| "sql": sql, | |
| "result": [], | |
| "count": 0 | |
| } | |
| except Exception as e: | |
| print(f"β Run Query Error: {e}") | |
| traceback.print_exc() | |
| return { | |
| "query": user_query, | |
| "error": str(e), | |
| "sql": sql if sql else "", | |
| "result": [], | |
| "count": 0 | |
| } | |
| # ========================================================= | |
| # DATABASE OPERATIONS | |
| # ========================================================= | |
| def save_detection(plate, state, vehicle_type, vehicle_conf, date, time): | |
| """Save a vehicle detection to the database | |
| Note: The table schema uses timestamp, date, hour, day columns. | |
| The 'time' parameter is extracted to hour for the hour column. | |
| """ | |
| try: | |
| if engine is None: | |
| print("β οΈ Engine not initialized - save_detection skipped") | |
| return False | |
| # Extract hour from time string (HH:MM:SS) | |
| try: | |
| hour = int(time.split(":")[0]) if time else 0 | |
| except: | |
| hour = 0 | |
| # Extract day of week from date (simplified) | |
| from datetime import datetime | |
| try: | |
| dt = datetime.strptime(date, "%Y-%m-%d") | |
| day = dt.strftime("%A") | |
| except: | |
| day = "Unknown" | |
| # Use timestamp for current time, date for the date field, hour for hourly grouping | |
| query = f""" | |
| INSERT INTO vehicle_logs | |
| (plate, state, vehicle_type, vehicle_conf, date, hour, day, timestamp, camera_id, location) | |
| VALUES ('{plate}', '{state}', '{vehicle_type}', {vehicle_conf}, '{date}', {hour}, '{day}', NOW(), 'CAM-01', 'default') | |
| """ | |
| with engine.connect() as conn: | |
| conn.execute(text(query)) | |
| conn.commit() | |
| print(f"β Saved: {plate} from {state} at {time}") | |
| return True | |
| except Exception as e: | |
| print(f"β Save Error: {e}") | |
| traceback.print_exc() | |
| return False | |
| def health_check(): | |
| """Check database health with timeout protection""" | |
| try: | |
| if engine is None: | |
| return False, "β Database not configured" | |
| with engine.connect() as conn: | |
| conn.execute(text("SET statement_timeout = 10000")) # 10 second timeout | |
| result = conn.execute(text("SELECT COUNT(*) FROM vehicle_logs")) | |
| count = result.scalar() | |
| return True, f"β Database OK - {count} records" | |
| except Exception as e: | |
| print(f"β Health Check Error (timeout?): {e}") | |
| return False, f"β Database Error: {str(e)}" | |
| def get_vehicles_by_state(): | |
| """Get vehicle count by state with timeout protection""" | |
| try: | |
| sql = """ | |
| SELECT state, COUNT(*) as count | |
| FROM vehicle_logs | |
| GROUP BY state | |
| ORDER BY count DESC | |
| """ | |
| with engine.connect() as conn: | |
| conn.execute(text("SET statement_timeout = 15000")) # 15 second timeout | |
| result = conn.execute(text(sql)) | |
| rows = [dict(r._mapping) for r in result] | |
| return rows | |
| except Exception as e: | |
| print(f"β State Query Error (timeout?): {e}") | |
| return [] | |
| def get_hourly_traffic(): | |
| """Get traffic by hour with timeout protection""" | |
| try: | |
| sql = """ | |
| SELECT hour, COUNT(*) as traffic | |
| FROM vehicle_logs | |
| GROUP BY hour | |
| ORDER BY hour | |
| """ | |
| with engine.connect() as conn: | |
| conn.execute(text("SET statement_timeout = 15000")) # 15 second timeout | |
| result = conn.execute(text(sql)) | |
| rows = [dict(r._mapping) for r in result] | |
| return rows | |
| except Exception as e: | |
| print(f"β Hourly Traffic Error (timeout?): {e}") | |
| return [] | |
| def get_top_plates(): | |
| """Get top detected plates with timeout protection""" | |
| try: | |
| sql = """ | |
| SELECT plate, COUNT(*) as detections | |
| FROM vehicle_logs | |
| GROUP BY plate | |
| ORDER BY detections DESC | |
| LIMIT 20 | |
| """ | |
| with engine.connect() as conn: | |
| conn.execute(text("SET statement_timeout = 15000")) # 15 second timeout | |
| result = conn.execute(text(sql)) | |
| rows = [dict(r._mapping) for r in result] | |
| return rows | |
| except Exception as e: | |
| print(f"β Top Plates Error (timeout?): {e}") | |
| return [] | |
| def get_suspicious_vehicles(): | |
| """Get vehicles detected multiple times (potentially suspicious) with timeout protection""" | |
| try: | |
| sql = """ | |
| SELECT plate, state, COUNT(*) as detections, | |
| COUNT(DISTINCT location) as locations, | |
| COUNT(DISTINCT date) as days | |
| FROM vehicle_logs | |
| GROUP BY plate, state | |
| HAVING COUNT(*) > 5 | |
| ORDER BY detections DESC | |
| LIMIT 20 | |
| """ | |
| with engine.connect() as conn: | |
| conn.execute(text("SET statement_timeout = 15000")) # 15 second timeout | |
| result = conn.execute(text(sql)) | |
| rows = [dict(r._mapping) for r in result] | |
| return rows | |
| except Exception as e: | |
| print(f"β Suspicious Vehicles Error (timeout?): {e}") | |
| return [] |