import json
import os
import re
import tempfile
from typing import Dict, List

import faiss
import numpy as np
from fastapi import BackgroundTasks, FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles
from groq import Groq
from gtts import gTTS
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
|
|
| |
| |
| |
# --- Credentials -----------------------------------------------------------
# Both keys come from the environment and are None when the variable is unset.
GROQ_LLM_KEY = os.getenv("GROQ_LLM_KEY")
GROQ_WHISPER_KEY = os.getenv("GROQ_WHISPER_KEY")

# --- Model identifiers -----------------------------------------------------
EMBED_MODEL = "all-MiniLM-L6-v2"  # sentence-transformers model used for retrieval
LLM_MODEL = "llama-3.3-70b-versatile"  # answer generation
TRANS_MODEL = "llama-3.1-8b-instant"  # translation
WHISPER_MODEL = "whisper-large-v3"  # speech-to-text
# Presumably the embedding width of EMBED_MODEL; not referenced by the code
# visible in this file — confirm before relying on it.
DIM = 384

# --- Domain configuration --------------------------------------------------
# Domains a request may name; anything else is answered as "OUT OF DOMAIN".
VALID_DOMAINS = {
    "islamic",
    "inheritance",
    "harassment",
    "general",
    "verify",
}

# Lower-case act-name fragment -> office to approach. infer_dept returns the
# first fragment found in an act name, so insertion order is significant.
DEPARTMENTS = {
    "muslim family laws ordinance": "Union Council / Family Court",
    "dowry": "Family Court or Police",
    "succession": "Family Court + Revenue Office",
    "harassment of women at workplace": "Ombudsperson",
    "domestic violence": "District Women Protection Committee",
    "penal code": "Local Police Station",
    "constitution": "High Court",
    "family courts": "Family Court",
}
|
|
| |
| |
| |
# Startup: load the embedding model, the FAISS index and the chunk stores once
# at import time so every request handler shares the same objects.
# NOTE(review): the two status glyphs were mis-encoded in the file; they are
# restored to what they presumably were (hourglass / check mark) — confirm.
print("⏳ Loading...")
embedder = SentenceTransformer(EMBED_MODEL)
index = faiss.read_index("faiss_index.bin")

# Explicit UTF-8: open() otherwise falls back to the platform's locale
# encoding, which breaks on non-ASCII legal text on some systems.
with open("chunks_meta.json", "r", encoding="utf-8") as f:
    chunk_metas = json.load(f)

# One JSON object per line; blank lines (e.g. a trailing newline) are skipped
# instead of crashing json.loads.
with open("all_chunks.jsonl", "r", encoding="utf-8") as f:
    chunk_texts = [
        json.loads(record)["text"]
        for record in (line.strip() for line in f)
        if record
    ]

llm_client = Groq(api_key=GROQ_LLM_KEY)
whisper_client = Groq(api_key=GROQ_WHISPER_KEY)

# The retriever indexes chunk_metas / chunk_texts by FAISS row id, so the three
# stores must line up. Warn (rather than abort) to keep startup best-effort.
if not (index.ntotal == len(chunk_metas) == len(chunk_texts)):
    print(
        "WARNING: size mismatch - "
        f"index={index.ntotal}, metas={len(chunk_metas)}, texts={len(chunk_texts)}"
    )

print(f"✅ Ready: {index.ntotal} vectors")
|
|
| |
| |
| |
# Inclusive code-point ranges of the Arabic-script Unicode blocks checked here.
_ARABIC_SCRIPT_RANGES = (
    ("\u0600", "\u06FF"),  # Arabic
    ("\u0750", "\u077F"),  # Arabic Supplement
    ("\uFB50", "\uFDFF"),  # Arabic Presentation Forms-A
    # Arabic Presentation Forms-B, stopping before U+FEFF: that code point is
    # the byte-order mark, which must not make plain English text look Urdu.
    ("\uFE70", "\uFEFE"),
)


def is_urdu_script(text: str) -> bool:
    """Return True if *text* contains at least one Arabic-script character.

    Args:
        text: Any string; an empty string yields False.

    Returns:
        True as soon as one character falls inside an Arabic-script block,
        False otherwise. A byte-order mark (U+FEFF) is not counted.
    """
    return any(
        low <= ch <= high
        for ch in text
        for low, high in _ARABIC_SCRIPT_RANGES
    )
|
|
# Words whose presence suggests Roman Urdu. Built once instead of per call.
# NOTE(review): "case", "police" and "court" are also ordinary English words,
# so short English queries containing them are classified as Roman Urdu.
_ROMAN_URDU_MARKERS = frozenset({
    "kya", "hai", "mera", "talaq", "mehr", "virasat", "shohar", "biwi",
    "case", "police", "court", "ka", "ki", "ke", "ko", "se", "mein",
    "karna", "kaise", "nahi", "haan", "batao", "chahiye", "hoga", "apna",
})

# Punctuation trimmed from both ends of a token before the marker lookup.
_TOKEN_PUNCTUATION = ".,!?;:\"'()[]{}<>-"


def detect_language(text: str) -> str:
    """Classify *text* as "urdu", "roman_urdu" or "en".

    Args:
        text: The user's query.

    Returns:
        "urdu" when the text contains Arabic-script characters; otherwise
        "roman_urdu" when more than 12% of the whitespace-separated tokens
        are known Roman Urdu marker words; otherwise "en" (including for
        empty or whitespace-only input).
    """
    if is_urdu_script(text):
        return "urdu"

    tokens = text.lower().split()
    if not tokens:
        return "en"

    # Trim surrounding punctuation so "hai?" or "talaq," still count as hits.
    # The denominator stays the raw token count.
    hits = sum(
        1 for token in tokens
        if token.strip(_TOKEN_PUNCTUATION) in _ROMAN_URDU_MARKERS
    )
    return "roman_urdu" if hits / len(tokens) > 0.12 else "en"
|
|
def translate(text: str, target: str) -> str:
    """Translate *text* with the translation LLM.

    Args:
        text: Text to translate.
        target: "urdu" (Arabic script), "roman" (Roman Urdu) or "en"
            (English). Any other value means "no translation".

    Returns:
        The translated text. The input is returned unchanged when it is
        empty or whitespace-only, when *target* is not recognised, or when
        the model returns no content.
    """
    # Nothing to translate: skip the API call instead of asking the model to
    # translate an empty prompt.
    if not text or not text.strip():
        return text

    if target == "urdu":
        prompt = f"Translate to proper Urdu (Arabic script). Simple Pakistani Urdu:\n\n{text}"
    elif target == "roman":
        prompt = f"Translate to Roman Urdu (English alphabet):\n\n{text}"
    elif target == "en":
        src = "Urdu" if is_urdu_script(text) else "Roman Urdu"
        prompt = f"Translate this {src} to English:\n\n{text}"
    else:
        return text

    resp = llm_client.chat.completions.create(
        model=TRANS_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=1200,
    )
    # The completion content may be None; fall back to the untranslated text
    # rather than crashing or blanking the caller's field.
    translated = (resp.choices[0].message.content or "").strip()
    return translated or text
|
|
def retrieve_chunks(query: str, top_k: int = 5, domain_filter: str = None):
    """Return the best-matching chunk texts and their metadata for *query*.

    The query is embedded and the index is searched for ``top_k * 3``
    neighbours; the over-fetch leaves room for the domain filter to discard
    some of them.

    Args:
        query: Search text.
        top_k: Maximum number of chunks to return.
        domain_filter: When set to anything other than "verify", only chunks
            whose ``domain`` equals this value or "general" are kept.

    Returns:
        A ``(texts, metas)`` pair of equal-length lists, both empty when
        nothing matches.
    """
    query_vec = embedder.encode([query], normalize_embeddings=True).astype("float32")
    _, neighbour_ids = index.search(query_vec, top_k * 3)

    restrict = bool(domain_filter) and domain_filter != "verify"
    allowed_domains = (domain_filter, "general")

    texts, metas = [], []
    for idx in neighbour_ids[0]:
        # The code skips -1 ids, which it treats as "no neighbour" slots.
        if idx == -1:
            continue
        meta = chunk_metas[idx]
        if restrict and meta["domain"] not in allowed_domains:
            continue
        texts.append(chunk_texts[idx])
        metas.append(meta)
        if len(texts) >= top_k:
            break
    return texts, metas
|
|
def infer_dept(meta_list):
    """Pick the office to approach, based on the acts named in *meta_list*.

    Args:
        meta_list: Iterable of chunk-metadata dicts. Each may carry an
            ``act_name``; a missing, ``None`` or non-string value is treated
            as an empty name.

    Returns:
        The DEPARTMENTS value for the first fragment found in the first act
        name that matches, or "Relevant District Court" when nothing matches
        (including for an empty *meta_list*).
    """
    for meta in meta_list:
        # `or ""` guards against an explicit null in the metadata, which
        # .get's default alone would not catch.
        act = str(meta.get("act_name") or "").lower()
        for fragment, department in DEPARTMENTS.items():
            if fragment in act:
                return department
    return "Relevant District Court"
|
|
# System prompt for normal Q&A mode.
# NOTE(review): the arrow in rule 2 was mis-encoded in the file and is
# restored here as "→" — confirm against the intended prompt.
_QA_SYSTEM_PROMPT = """You are a Pakistani legal educator. Rules:
1) ONLY use provided chunks. 2) No answer → NOT IN DATABASE + dept.
3) ALWAYS cite Act, Section, Year. 4) No paraphrasing beyond simplification.
5) NEVER combine laws. 6) Domain lock.

FORMAT:
PLAIN ANSWER: [1-3 sentences]
LAW CITED: [Act, Section, Year]
VERDICT: VERIFIED / NOT IN DATABASE
WHERE TO FILE: [Exact department + step]"""

# System prompt for fact-checking mode.
_VERIFY_SYSTEM_PROMPT = """Fact-checker. Use ONLY chunks.
VERDICT: VERIFIED / FARCE / NOT IN DATABASE
LAW CITED: [Act, Section, Year]
EVIDENCE: [Quote]"""

# Output labels the model is asked to emit -> key in the response dict.
_ANSWER_LABELS = (
    ("PLAIN ANSWER:", "plain_answer"),
    ("LAW CITED:", "law_cited"),
    ("VERDICT:", "verdict"),
    ("WHERE TO FILE:", "where_to_file"),
)

# Any other ALL-CAPS "LABEL:" prefix (e.g. "EVIDENCE:") closes the current field.
_OTHER_LABEL = re.compile(r"[A-Z][A-Z ]{3,}:")

# Accepted `lang` values -> translate() target. "roman_urdu" is what
# detect_language reports, so it is accepted as an alias of "roman".
_TRANSLATION_TARGETS = {"urdu": "urdu", "roman": "roman", "roman_urdu": "roman"}


def _parse_answer_fields(raw: str) -> dict:
    """Extract the labelled fields from the model's raw reply."""
    fields = {}
    current = None
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped:
            current = None  # a blank line ends the field being collected
            continue
        # Tolerate markdown bold / list / heading markers around the label.
        candidate = stripped.replace("**", "").lstrip("#>*- ").strip()
        for label, key in _ANSWER_LABELS:
            if candidate[:len(label)].upper() == label:
                fields[key] = candidate[len(label):].strip()
                current = key
                break
        else:
            if _OTHER_LABEL.match(candidate):
                current = None
            elif current is not None:
                # Continuation line of a multi-line value.
                fields[current] = f"{fields[current]} {stripped}".strip()
    return fields


def generate_answer(query: str, domain: str, mode: str, lang: str) -> dict:
    """Answer *query* from retrieved law chunks and return a structured result.

    Args:
        query: The question, already in English.
        domain: One of VALID_DOMAINS; used as the retrieval filter unless it
            is "verify".
        mode: "verify" selects the fact-checker prompt; any other value uses
            the Q&A prompt.
        lang: "urdu", or "roman"/"roman_urdu", translates the answer fields;
            any other value leaves them as produced.

    Returns:
        ``{"error", "verdict"}`` when *domain* is not allowed; otherwise a
        dict with ``plain_answer``, ``law_cited``, ``verdict``,
        ``where_to_file`` and ``department``.
    """
    if domain not in VALID_DOMAINS:
        return {"error": "This is outside my domain.", "verdict": "OUT OF DOMAIN"}

    d_filter = None if domain == "verify" else domain
    texts, metas = retrieve_chunks(query, top_k=5, domain_filter=d_filter)

    if not texts:
        # NOTE(review): DEPARTMENTS is keyed by act-name fragments, none of
        # which occurs inside a VALID_DOMAINS name, so this lookup appears to
        # always give the "Relevant District Court" default — confirm whether
        # a domain -> department mapping was intended.
        fallback_dept = infer_dept([{"act_name": domain}])
        return {
            "plain_answer": "This specific point is not in my verified database.",
            "law_cited": "N/A",
            "verdict": "NOT IN DATABASE",
            "where_to_file": fallback_dept,
            "department": fallback_dept,
        }

    # .get keeps a chunk with incomplete metadata from aborting the request,
    # matching how infer_dept already treats act_name as optional.
    context = "\n\n---\n\n".join(
        f"CHUNK {n} [Act: {m.get('act_name', 'N/A')}, Sec: {m.get('section', 'N/A')}, "
        f"Year: {m.get('year', 'N/A')}]:\n{t}"
        for n, (t, m) in enumerate(zip(texts, metas), start=1)
    )

    department = infer_dept(metas)
    sys_msg = _VERIFY_SYSTEM_PROMPT if mode == "verify" else _QA_SYSTEM_PROMPT
    dept_hint = f"\n\n[DIRECT TO]: {department}"
    messages = [
        {"role": "system", "content": sys_msg + dept_hint},
        {"role": "user", "content": f"[CHUNKS]\n{context}\n\n[QUERY]\n{query}"},
    ]

    resp = llm_client.chat.completions.create(
        model=LLM_MODEL, messages=messages, temperature=0.1, max_tokens=1200
    )
    # The completion content may be None; treat that as an empty reply.
    raw = (resp.choices[0].message.content or "").strip()

    # NOTE(review): "VERIFIED" remains the default when the model emits no
    # VERDICT line, as before — consider a more conservative default.
    parsed = {
        "plain_answer": "",
        "law_cited": "",
        "verdict": "VERIFIED",
        "where_to_file": "",
        "department": department,
    }
    parsed.update(_parse_answer_fields(raw))

    # No PLAIN ANSWER label (always the case in verify mode): show the start
    # of the raw reply instead.
    if not parsed["plain_answer"]:
        parsed["plain_answer"] = raw[:500]

    target = _TRANSLATION_TARGETS.get(lang)
    if target:
        for field in ("plain_answer", "law_cited", "where_to_file"):
            if parsed[field]:
                parsed[field] = translate(parsed[field], target)

    return parsed
|
|
| |
| |
| |
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI(title="QaanoonSathi AI")

# Serve the contents of ./static under the /static URL prefix.
app.mount(
    path="/static",
    app=StaticFiles(directory="static"),
    name="static",
)
|
|
class QueryRequest(BaseModel):
    """JSON body accepted by POST /api/ask."""

    # The user's question; the handler detects Urdu script / Roman Urdu and
    # translates such queries to English before answering.
    query: str
    # Legal domain; generate_answer rejects values outside VALID_DOMAINS.
    domain: str = "general"
    # Output language: "urdu" or "roman" make generate_answer translate the
    # answer fields; other values leave them untranslated.
    lang: str = "en"
    # "verify" selects the fact-checker prompt; other values use the Q&A prompt.
    mode: str = "qa"
|
|
@app.get("/", response_class=HTMLResponse)
async def home():
    """Serve the front-end page stored at static/index.html."""
    landing_page = "static/index.html"
    return FileResponse(landing_page)
|
|
def _answer_text_query(query: str, domain: str, mode: str, lang: str) -> dict:
    """Run the blocking detect -> translate -> answer pipeline for one query."""
    detected = detect_language(query)
    english_query = query
    if detected in ("urdu", "roman_urdu"):
        english_query = translate(query, "en")

    result = generate_answer(english_query, domain, mode, lang)
    result["detected_input_lang"] = detected
    result["original_query"] = query
    return result


@app.post("/api/ask")
async def ask(req: QueryRequest):
    """Answer a text question.

    Args:
        req: Query text plus the domain, output language and mode.

    Returns:
        The dict produced by generate_answer, extended with
        ``detected_input_lang`` and ``original_query``.
    """
    # The pipeline makes blocking network and model calls; running it inline
    # in an `async def` handler would stall the event loop for every other
    # request, so it is handed to the worker thread pool.
    # NOTE(review): the shared embedder / index / clients are now used from
    # worker threads — confirm they tolerate concurrent calls.
    return await run_in_threadpool(
        _answer_text_query, req.query, req.domain, req.mode, req.lang
    )
|
|
def _transcribe_and_answer(audio: bytes, domain: str, lang: str) -> dict:
    """Transcribe *audio*, then run the blocking Q&A pipeline on the transcript."""
    # A unique temp file per request: a fixed path would let concurrent
    # requests overwrite each other's audio.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as out:
            out.write(audio)
        with open(wav_path, "rb") as src:
            transcript = whisper_client.audio.transcriptions.create(
                model=WHISPER_MODEL, file=src, response_format="text"
            )
    finally:
        os.remove(wav_path)

    detected = detect_language(transcript)
    english_query = transcript
    if detected in ("urdu", "roman_urdu"):
        english_query = translate(transcript, "en")

    result = generate_answer(english_query, domain, "qa", lang)
    result["transcript"] = transcript
    result["detected_input_lang"] = detected
    return result


@app.post("/api/voice")
async def voice_input(audio: bytes, domain: str = "general", lang: str = "en"):
    """Answer a spoken question.

    Args:
        audio: Recorded audio bytes.
        domain: Legal domain passed to generate_answer.
        lang: Output language passed to generate_answer.

    Returns:
        The dict produced by generate_answer (always in "qa" mode), extended
        with ``transcript`` and ``detected_input_lang``.
    """
    # NOTE(review): a bare `bytes` parameter with no File()/Body() marker
    # appears to be declared to FastAPI as a query parameter, not as an upload
    # or request body — confirm how the client sends the audio.
    # Transcription and answering block on network/model calls, so they run in
    # the worker thread pool instead of on the event loop.
    return await run_in_threadpool(_transcribe_and_answer, audio, domain, lang)
|
|
def _synthesize_mp3(text: str, gtts_lang: str) -> str:
    """Render *text* to a fresh temporary MP3 file and return its path."""
    fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS reopens the path itself
    try:
        gTTS(text=text, lang=gtts_lang, slow=False).save(mp3_path)
    except Exception:
        # Do not leave an orphaned temp file behind when synthesis fails.
        os.remove(mp3_path)
        raise
    return mp3_path


@app.get("/api/tts")
async def text_to_speech_endpoint(text: str, lang: str = "en"):
    """Convert answer text to speech and return it as MP3 audio.

    Args:
        text: Text to speak; the answer-format labels and newlines are
            removed first.
        lang: "urdu" selects the Urdu voice; any other value uses English.

    Returns:
        A FileResponse streaming the generated MP3.

    Raises:
        HTTPException: 400 when nothing remains to speak after cleaning.
    """
    clean = re.sub(r'PLAIN ANSWER:|LAW CITED:|VERDICT:|WHERE TO FILE:', '', text)
    clean = re.sub(r'\n+', ' ', clean).strip()
    # gTTS rejects empty text; answer with a client error instead of a 500.
    if not clean:
        raise HTTPException(status_code=400, detail="No text to speak.")

    gl = "ur" if lang == "urdu" else "en"

    # Synthesis is a blocking call, so keep it off the event loop.
    mp3_path = await run_in_threadpool(_synthesize_mp3, clean, gl)

    # A per-request file (rather than a fixed /tmp/out.mp3) keeps concurrent
    # requests from overwriting audio that is still being streamed; it is
    # deleted once the response has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, mp3_path)
    return FileResponse(mp3_path, media_type="audio/mpeg", background=cleanup)