"""iContract — Gradio demo for legal-contract clause QA.

Pipeline: upload a contract (PDF/TXT) -> split into chunks -> optionally
rerank chunks against a CUAD-style clause question with a cross-encoder ->
run an extractive Longformer QA model over the selected chunks.
"""

import json
import os
import traceback

import gradio as gr

from chunker import extract_text_from_file, chunk_text
from inference import answer_topk_longformer
from reranker import CrossEncoderRanker

# Cross-encoder used to rerank chunks; overridable via env var for experiments.
DEFAULT_RANKER = os.environ.get(
    "RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2"
)

# Clause catalogue: maps a human-readable label to the full question text.
with open("questions_clauses.json", "r", encoding="utf-8") as f:
    clauses_data = json.load(f)

CLAUSES_MAP = {item["question_id_text"]: item["question"] for item in clauses_data}
CLAUSES_LABELS = list(CLAUSES_MAP.keys())

ranker = CrossEncoderRanker(model_name=DEFAULT_RANKER)

# Global state to store chunks between steps.
# NOTE(review): module-level state is shared by every visitor of the app —
# acceptable for a single-user demo, not safe for concurrent sessions.
_chunks_cache = []


def chunk_document(file, progress=gr.Progress(track_tqdm=True)):
    """Extract text from an uploaded contract and split it into chunks.

    Args:
        file: Gradio file object (``file.name`` is the temp path) or ``None``.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A ``(status_markdown, preview_markdown)`` pair for the two outputs.
    """
    global _chunks_cache
    if file is None:
        return "⚠️ Please upload a contract file.", ""
    try:
        progress(0.2, desc="📄 Extracting text...")
        text = extract_text_from_file(file.name)
        # chunker signals extraction failures via an "ERROR:"-prefixed string.
        if text.startswith("ERROR:"):
            return f"❌ {text}", ""

        progress(0.6, desc="✂️ Chunking document...")
        _chunks_cache = chunk_text(text)
        total_chunks = len(_chunks_cache)
        progress(1.0, desc="✅ Done!")

        # Preview the first few chunks so the user can sanity-check extraction.
        # (Without this, the "Showing first 5 below" message would be a lie.)
        preview = "\n\n---\n\n".join(
            f"**Chunk {i + 1}:**\n{c}" for i, c in enumerate(_chunks_cache[:5])
        )
        summary = (
            f"✅ Document chunked into **{total_chunks} chunks**. "
            f"Showing first 5 below.\n\n{preview}"
        )
        status = (
            f"✅ Ready — {total_chunks} chunks created.\n"
            "Now select a question and click Analyze."
        )
        return status, summary
    except Exception:
        # Surface the full traceback in the UI rather than failing silently.
        return f"❌ Error:\n```\n{traceback.format_exc()}\n```", ""


def run_pipeline(question_label, search_mode, top_k_chunks, top_k_answers,
                 progress=gr.Progress(track_tqdm=True)):
    """Answer a clause question against the cached chunks.

    Generator: yields ``(status, chunks_markdown, answers_markdown)`` tuples so
    the UI can stream intermediate status while the models run.

    Args:
        question_label: key into ``CLAUSES_MAP`` (constrained by the dropdown).
        search_mode: "Top-K chunks (reranked)" or "All chunks".
        top_k_chunks: number of chunks kept by the reranker.
        top_k_answers: number of answers returned by the QA model.
        progress: Gradio progress tracker (injected by the UI).
    """
    global _chunks_cache
    if not _chunks_cache:
        yield "⚠️ Please upload and chunk a document first.", "", ""
        return

    question = CLAUSES_MAP[question_label]
    try:
        total_chunks = len(_chunks_cache)

        if search_mode == "Top-K chunks (reranked)":
            yield f"🔍 Reranking {total_chunks} chunks...", "", ""
            progress(0.3, desc="🔍 Reranking chunks...")
            ranked = ranker.rank(question, _chunks_cache, top_k=int(top_k_chunks))
            selected_chunks = [chunk for chunk, score in ranked]
        else:
            yield f"📃 Using all {total_chunks} chunks...", "", ""
            progress(0.3, desc="📃 Using all chunks...")
            selected_chunks = _chunks_cache

        chunks_display = (
            f"📄 **Total chunks:** {total_chunks} | "
            f"**Using:** {len(selected_chunks)}\n\n"
        )
        chunks_display += "\n\n---\n\n".join(
            f"**Chunk {i + 1}:**\n{c}" for i, c in enumerate(selected_chunks)
        )

        yield (
            "🤖 Model is analyzing the document... "
            "(this may take 30–60s on first run)",
            chunks_display,
            "",
        )
        progress(0.65, desc="🤖 Running QA model...")
        answers = answer_topk_longformer(
            question, selected_chunks, top_k=int(top_k_answers)
        )

        progress(0.95, desc="✅ Formatting results...")
        if not answers:
            yield (
                "⚠️ No answers found.\nTry different chunks or question.",
                chunks_display,
                "",
            )
            return

        answers_display = (
            f"### ❓ Question\n> {question}\n\n---\n\n### 📋 Top Answers\n\n"
        )
        for i, (ans, score) in enumerate(answers, 1):
            answers_display += (
                f"**Answer {i}** — Score: `{score:.4f}`\n\n{ans}\n\n{'—' * 50}\n\n"
            )

        progress(1.0, desc="Done!")
        yield "✅ Done!", chunks_display, answers_display
    except Exception:
        # Surface the full traceback in the UI rather than failing silently.
        yield f"❌ Error:\n```\n{traceback.format_exc()}\n```", "", ""


with gr.Blocks(title="iContract QA") as demo:
    gr.Markdown("# 📑 iContract — Legal Contract QA")
    gr.Markdown(
        "⚠️ **First analysis may take ~30–60 seconds** as the QA model "
        "(~600MB) loads for the first time."
    )

    # ── Step 1 ──────────────────────────────────────────
    gr.Markdown("## Step 1 — Upload & Chunk Document")
    with gr.Row():
        file_input = gr.File(
            label="Upload Contract (PDF or TXT)",
            file_types=[".pdf", ".PDF", ".txt"],
        )
        chunk_btn = gr.Button("✂️ Chunk Document", variant="secondary")
    chunk_status = gr.Textbox(
        label="⏳ Status",
        interactive=False,
        value="Ready — upload a contract and chunk it.",
    )
    chunk_preview = gr.Markdown()

    # ── Step 2 ──────────────────────────────────────────
    gr.Markdown("## Step 2 — Analyze")
    with gr.Row():
        question_select = gr.Dropdown(
            choices=CLAUSES_LABELS,
            value=CLAUSES_LABELS[0],
            label="Select a clause to analyze",
        )
    with gr.Row():
        search_mode = gr.Radio(
            choices=["Top-K chunks (reranked)", "All chunks"],
            value="Top-K chunks (reranked)",
            label="Search Mode",
        )
    with gr.Row():
        top_k_chunks = gr.Slider(
            1, 20, value=5, step=1, label="Top-K Chunks (reranker)"
        )
        top_k_answers = gr.Slider(1, 10, value=3, step=1, label="Top-K Answers")

    analyze_btn = gr.Button("🔍 Analyze", variant="primary")
    analyze_status = gr.Textbox(label="⏳ Status", interactive=False, value="")

    with gr.Tabs():
        with gr.Tab("📄 Selected Chunks"):
            chunks_out = gr.Markdown()
        with gr.Tab("✅ Answers"):
            answers_out = gr.Markdown()

    chunk_btn.click(
        fn=chunk_document,
        inputs=[file_input],
        outputs=[chunk_status, chunk_preview],
    )
    analyze_btn.click(
        fn=run_pipeline,
        inputs=[question_select, search_mode, top_k_chunks, top_k_answers],
        outputs=[analyze_status, chunks_out, answers_out],
    )


if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable from outside a container (e.g. HF Spaces).
    demo.launch(server_name="0.0.0.0")