"""AI Legal Draft Generator.

Gradio app that retrieves legal context from a Pinecone knowledge base,
optionally analyzes an uploaded template draft (stored in a Hugging Face
dataset repo), and asks OpenAI models to produce a court-ready petition
in Markdown, downloadable as DOCX.
"""

import json
import os
import re
import tempfile
from datetime import datetime, timezone
from io import BytesIO
from typing import Dict, List, Tuple

import gradio as gr
import markdown
import openai
import pdfplumber
import pypandoc
from bs4 import BeautifulSoup
from docx import Document
from huggingface_hub import HfApi, hf_hub_download
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

# ----------------- CONFIG -----------------
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = "legal-ai"
HF_DATASET_REPO = "omarkashif/legal-draft-templates"
HF_TOKEN = os.getenv("HF_TOKEN")

openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX)
embedder = SentenceTransformer("all-mpnet-base-v2")
api = HfApi()


# ----------------- HELPERS -----------------
def load_reference_text(uploaded_file) -> str:
    """Extract plain text from an uploaded DOCX, PDF, or TXT file.

    `uploaded_file` is the object Gradio's File component hands over;
    it is assumed to expose `.name` and (for .txt) `.read()` —
    NOTE(review): confirm this matches the installed Gradio version's
    file-upload return type. Returns "" for missing/unsupported files.
    """
    if not uploaded_file:
        return ""
    lower_name = uploaded_file.name.lower()
    if lower_name.endswith(".docx"):
        doc = Document(uploaded_file)
        return "\n".join(p.text for p in doc.paragraphs)
    elif lower_name.endswith(".pdf"):
        text = ""
        with pdfplumber.open(uploaded_file) as pdf:
            for page in pdf.pages:
                t = page.extract_text()
                if t:  # extract_text() returns None for image-only pages
                    text += t + "\n"
        return text
    elif lower_name.endswith(".txt"):
        return uploaded_file.read().decode("utf-8", errors="ignore")
    return ""


def load_templates_json() -> List[Dict]:
    """Load templates.json from HF dataset repo.

    Returns the parsed list of template records, or [] on any failure
    (network error, missing file, bad JSON) — the UI degrades gracefully
    to "no saved templates".
    """
    try:
        file_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename="templates.json",
            repo_type="dataset",
        )
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return []


def save_template_to_hf(name: str, analysis: str) -> Tuple[bool, str]:
    """Save new template into HF dataset repo (templates.json).

    Returns (ok, message). Rejects duplicate names. The whole
    download-modify-upload cycle is not atomic: concurrent saves can
    race and one write may be lost — acceptable for a single-user app.
    """
    try:
        # 1. Load latest file
        file_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename="templates.json",
            repo_type="dataset",
        )
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # 2. Check duplicates
        existing_names = [t.get("name") for t in data]
        if name in existing_names:
            return False, f"Template name '{name}' already exists."

        # 3. Append new template (timezone-aware UTC; utcnow() is deprecated)
        data.append({
            "name": name,
            "analysis": analysis,
            "uploaded_at": datetime.now(timezone.utc).isoformat(),
        })

        # 4. Save locally
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        # 5. Push back to repo
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo="templates.json",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            commit_message=f"Add new template: {name}",
            token=HF_TOKEN,
        )
        return True, f"✅ Template '{name}' added to HF dataset."
    except Exception as e:
        return False, f"❌ Error saving template: {e}"


def parse_json_safe(raw_text: str, fallback: str) -> List[str]:
    """Parse an LLM response that should be a JSON array of strings.

    Falls back to scraping quoted substrings when the response is not
    valid JSON (or is valid JSON but not a list), and finally to the
    first 512 chars of `fallback` so callers always get a usable query.
    """
    try:
        parsed = json.loads(raw_text)
        if isinstance(parsed, list):
            return parsed
    except (json.JSONDecodeError, TypeError):
        pass
    # Salvage: pull out anything that looks like a quoted query string.
    matches = re.findall(r'"([^"]+)"', raw_text)
    if matches:
        return matches
    return [fallback[:512]]


def build_queries_with_llm(user_text: str, max_queries: int = 15) -> List[str]:
    """Ask the LLM to devise retrieval queries for the case description.

    Returns at most `max_queries` query strings; on any API failure the
    raw case text (truncated) is used as the single query.
    """
    system_prompt = (
        "You are a legal research assistant. "
        "A new petition needs to be drafted using the following client/case description. "
        "Devise 5–6 or more concise queries that will be helpful to retrieve relevant information "
        "from a knowledge base containing the Constitution of Pakistan, Punjab case law, "
        "and FBR tax ordinances. "
        "Return ONLY a JSON array of strings, no extra text."
    )
    try:
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_text}],
            temperature=0.1,
            max_tokens=2000,
        )
        raw = resp.choices[0].message.content.strip()
        return parse_json_safe(raw, user_text)[:max_queries]
    except Exception:
        return [user_text[:512]]


def pinecone_search(queries: List[str], top_k: int = 10,
                    max_chars: int = 10000) -> Tuple[str, List[Dict]]:
    """Embed each query, search Pinecone, and assemble a context string.

    Deduplicates chunks by their first 200 chars. Returns the joined
    context (bullet list) and a parallel citation list of
    {"score", "source"} dicts. NOTE(review): the max_chars `break`
    only exits the inner matches loop, so remaining queries still run —
    total context can exceed max_chars by roughly one query's worth.
    """
    seen_texts, context_parts, citations = set(), [], []
    for q in queries:
        vec = embedder.encode(q).tolist()
        res = index.query(vector=vec, top_k=top_k, include_metadata=True)
        matches = res.get("matches", [])
        for m in matches:
            md = m.get("metadata", {})
            txt = md.get("text") or ""
            if not txt or txt[:200] in seen_texts:
                continue
            seen_texts.add(txt[:200])
            context_parts.append(f"- {txt.strip()}")
            citations.append({
                "score": float(m.get("score") or 0.0),
                "source": md.get("chunk_id") or md.get("title") or "Unknown",
            })
            if sum(len(p) for p in context_parts) > max_chars:
                break
    return "\n".join(context_parts), citations


def markdown_to_docx(md_text: str) -> str:
    """Convert Markdown text to DOCX using Pandoc (preserves full formatting).

    Returns the path of the written .docx. If Pandoc is unavailable or
    fails, falls back to dumping the raw Markdown into a single
    paragraph so the user still gets a downloadable file.
    """
    tmp_path = os.path.join(tempfile.gettempdir(), "draft.docx")
    try:
        pypandoc.convert_text(
            md_text,
            "docx",
            format="md",
            outputfile=tmp_path,
            extra_args=["--standalone"],
        )
        return tmp_path
    except Exception:
        # Fallback simple converter: raw markdown in one paragraph.
        doc = Document()
        doc.add_paragraph(md_text)
        doc.save(tmp_path)
        return tmp_path


# ----------------- ANALYZER -----------------
def analyze_template_draft(ref_text: str) -> str:
    """Summarize the structure/style of a reference draft via the LLM.

    The returned report is later fed verbatim into the drafting prompt
    as template instructions. Input is truncated to 40k chars to stay
    within context limits. Errors are returned inline as text so the
    pipeline never raises.
    """
    if not ref_text:
        return "(no template provided)"
    system_prompt = """You are a legal draft analyzer. Your task is to carefully analyze the uploaded legal draft document and summarize its full structure and style.

Extract the following information clearly and systematically:
1. Headings and subheadings (exact order).
2. Approximate length/word count per section.
3. Purpose of each section (what content it usually contains).
4. Writing style and tone (formal/informal, persuasive, assertive, etc.).
5. Formatting conventions (headings, numbering, bullet points, capitalization).
6. Sentence/paragraph length and complexity.
7. Any special legal phrases or terminology patterns.
8. Any notes on length and overall flow.

Return a report that can be given as instructions to another model so it can treat this document as template to write a new legal draft based on this template (in terms of style, language, tone, length, format). Do not rewrite the draft, only analyze it."""
    try:
        resp = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": ref_text[:40000]}],
            max_completion_tokens=1000,
            temperature=0.1,
        )
        return resp.choices[0].message.content.strip()
    except Exception as e:
        return f"(Analyzer error: {e})"


# ----------------- MAIN -----------------
def generate_legal_draft(case_text, uploaded_file, template_name,
                         new_template_name, add_citations=True):
    """Full pipeline: retrieve context, resolve template, draft, convert.

    Generator used as a streaming Gradio callback — each yield is
    (markdown update, docx path or None) so the UI shows progress
    messages before the final draft.
    """
    yield gr.update(value="🔍 Searching in Knowledge Base..."), None
    queries = build_queries_with_llm(case_text)
    context_text, citations = pinecone_search(queries, top_k=10)

    # Handle template: saved template takes priority over a fresh upload.
    template_analysis = ""
    if template_name and template_name != "None":
        # Load existing template
        templates = load_templates_json()
        chosen = next((t for t in templates if t["name"] == template_name), None)
        template_analysis = chosen["analysis"] if chosen else "(not found)"
    elif uploaded_file:
        yield gr.update(value="📝 Analyzing Uploaded Template..."), None
        ref_text = load_reference_text(uploaded_file)
        template_analysis = analyze_template_draft(ref_text)
        # Save to HF dataset
        if new_template_name:
            ok, msg = save_template_to_hf(new_template_name, template_analysis)
            yield gr.update(value=msg), None
        else:
            template_analysis = "(Template uploaded but no name provided)"

    yield gr.update(value="⚖️ Generating Final Draft..."), None
    system_prompt = """You are an expert legal drafter for Pakistani law. Your task is to create a professional, court-ready legal petition in MARKDOWN format using four inputs:
1. User Input: Case details including client info, petition type, court, facts, relevant laws, and sections.
2. Knowledge Base Context: Relevant laws, case precedents, and ordinances retrieved from the vector database (Constitution of Pakistan, Punjab case law, FBR ordinances).
3. Template Draft Analysis: A structured analysis of an uploaded legal template (headings, section purposes, tone, formatting rules, length, style).
4. Fallback: If some info is missing, state explicitly instead of hallucinating.

Instructions
- Replicate the section hierarchy and style described in the template analysis.
- Ensure clarity, professionalism, and persuasive legal argumentation.
- Integrate legal context where appropriate with accurate citations.
- Output must be MARKDOWN format only, no explanations or extra commentary.
"""
    user_prompt = f"""
**User Input:**
{case_text}

**Knowledge Base Context:**
{context_text or '(no matches)'}

**Template Draft Analysis:**
{template_analysis}
"""
    try:
        resp = openai_client.chat.completions.create(
            model="gpt-5",
            messages=[{"role": "system", "content": system_prompt},
                      {"role": "user", "content": user_prompt}],
            max_completion_tokens=15000,
            verbosity="high",  # GPT-5-specific knob — TODO confirm SDK support
        )
        draft_md = resp.choices[0].message.content.strip()
    except Exception as e:
        draft_md = f"OpenAI error: {e}"

    if add_citations and citations:
        draft_md += "\n\n### References\n"
        for i, c in enumerate(citations, 1):
            draft_md += f"{i}. {c['source']} (score: {c['score']:.3f})\n"

    docx_path = markdown_to_docx(draft_md)
    yield gr.update(value=draft_md), docx_path


# ----------------- GRADIO UI -----------------
with gr.Blocks() as demo:
    gr.Markdown("## ⚖️ AI Legal Draft Generator\nUpload or select a template, then enter case details.")
    case_text = gr.Textbox(label="Case Details", lines=10,
                           placeholder="Enter client and case info...")
    uploaded_file = gr.File(label="Upload New Template (DOCX/PDF/TXT)",
                            file_types=[".docx", ".pdf", ".txt"])
    new_template_name = gr.Textbox(label="New Template Name (if uploading)")

    # Dropdown choices are read once at app start; new saves need a reload.
    templates = load_templates_json()
    template_names = ["None"] + [t["name"] for t in templates]
    template_name = gr.Dropdown(choices=template_names, value="None",
                                label="Select Existing Template")

    add_citations = gr.Checkbox(label="Append citations", value=True)
    draft_output = gr.Markdown(label="Draft Output")
    download_btn = gr.DownloadButton(label="⬇️ Download Word")
    btn = gr.Button("Generate Draft")
    btn.click(
        generate_legal_draft,
        inputs=[case_text, uploaded_file, template_name,
                new_template_name, add_citations],
        outputs=[draft_output, download_btn],
    )

if __name__ == "__main__":
    demo.launch()