Spaces:
Running
Running
import json
import gradio as gr
from chunker import extract_text_from_file, chunk_text
from reranker import CrossEncoderRanker
from inference import answer_topk_longformer
import os

# Reranker checkpoint is overridable via the RERANKER_MODEL env var;
# defaults to a small MS MARCO cross-encoder.
DEFAULT_RANKER = os.environ.get("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")

# Load the canned clause questions shipped next to the app.
# Each entry maps a short display label ("question_id_text") to the full
# natural-language question fed to the QA model.
with open("questions_clauses.json", "r", encoding="utf-8") as f:
    clauses_data = json.load(f)
CLAUSES_MAP = {item["question_id_text"]: item["question"] for item in clauses_data}
CLAUSES_LABELS = list(CLAUSES_MAP.keys())

# Instantiated once at import time so every request reuses the loaded model.
ranker = CrossEncoderRanker(model_name=DEFAULT_RANKER)

# Global state to store chunks between steps (Step 1 chunking -> Step 2 analysis).
# NOTE(review): module-global state is shared across all concurrent users of the
# Space — fine for a single-user demo, a per-session gr.State otherwise.
_chunks_cache = []
def chunk_document(file, progress=gr.Progress(track_tqdm=True)):
    """Step 1: extract text from the uploaded contract and split it into chunks.

    Stores the chunks in the module-level ``_chunks_cache`` for the analysis
    step and returns ``(status_message, preview_markdown)`` for the UI.

    Args:
        file: Gradio file object for the uploaded PDF/TXT (or None).
        progress: Gradio progress tracker injected by the framework.
    """
    global _chunks_cache
    if file is None:
        return "β οΈ Please upload a contract file.", ""
    try:
        progress(0.2, desc="π Extracting text...")
        text = extract_text_from_file(file.name)
        # chunker signals extraction failures in-band with an "ERROR:" prefix.
        if text.startswith("ERROR:"):
            return f"β {text}", ""
        progress(0.6, desc="βοΈ Chunking document...")
        _chunks_cache = chunk_text(text)
        total_chunks = len(_chunks_cache)
        progress(1.0, desc="β Done!")
        # Fix: the summary promised "Showing first 5 below" while the preview
        # code was commented out, so the message pointed at nothing. The
        # preview of the first 5 chunks is restored here.
        preview = "\n\n---\n\n".join(
            f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(_chunks_cache[:5])
        )
        summary = f"β Document chunked into **{total_chunks} chunks**. Showing first 5 below.\n\n{preview}"
        return f"β Ready β {total_chunks} chunks created. Now select a question and click Analyze.", summary
    except Exception:
        # Surface the full traceback in the status box so Space users can
        # report actionable errors; the exception object itself is unused.
        import traceback
        return f"β Error:\n```\n{traceback.format_exc()}\n```", ""
def run_pipeline(question_label, search_mode, top_k_chunks, top_k_answers, progress=gr.Progress(track_tqdm=True)):
    """Step 2: answer the selected clause question over the cached chunks.

    Generator that yields ``(status, chunks_markdown, answers_markdown)``
    tuples so the UI updates progressively. Requires ``chunk_document`` to
    have populated ``_chunks_cache`` first.

    Args:
        question_label: key into CLAUSES_MAP chosen in the dropdown.
        search_mode: "Top-K chunks (reranked)" or "All chunks".
        top_k_chunks: number of chunks the reranker keeps.
        top_k_answers: number of answers the QA model returns.
        progress: Gradio progress tracker injected by the framework.
    """
    global _chunks_cache
    if not _chunks_cache:
        yield "β οΈ Please upload and chunk a document first.", "", ""
        return
    try:
        # Fix: the map lookup used to happen before the try-block, so an
        # unknown label crashed with a raw KeyError instead of being shown
        # through the formatted error path below.
        question = CLAUSES_MAP[question_label]
        total_chunks = len(_chunks_cache)
        if search_mode == "Top-K chunks (reranked)":
            yield f"π Reranking {total_chunks} chunks...", "", ""
            progress(0.3, desc="π Reranking chunks...")
            ranked = ranker.rank(question, _chunks_cache, top_k=int(top_k_chunks))
            selected_chunks = [chunk for chunk, score in ranked]
        else:
            yield f"π Using all {total_chunks} chunks...", "", ""
            progress(0.3, desc="π Using all chunks...")
            selected_chunks = _chunks_cache
        chunks_display = f"π **Total chunks:** {total_chunks} | **Using:** {len(selected_chunks)}\n\n"
        chunks_display += "\n\n---\n\n".join(
            [f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(selected_chunks)]
        )
        # Show the selected chunks immediately; the QA model call below is slow.
        yield "π€ Model is analyzing the document... (this may take 30β60s on first run)", chunks_display, ""
        progress(0.65, desc="π€ Running QA model...")
        answers = answer_topk_longformer(question, selected_chunks, top_k=int(top_k_answers))
        progress(0.95, desc="β Formatting results...")
        if not answers:
            yield "β οΈ No answers found. Try different chunks or question.", chunks_display, ""
            return
        answers_display = f"### β Question\n> {question}\n\n---\n\n### π Top Answers\n\n"
        for i, (ans, score) in enumerate(answers, 1):
            answers_display += f"**Answer {i}** β Score: `{score:.4f}`\n\n{ans}\n\n{'β'*50}\n\n"
        progress(1.0, desc="Done!")
        yield "β Done!", chunks_display, answers_display
    except Exception:
        # Surface the full traceback in the status box; exception object unused.
        import traceback
        yield f"β Error:\n```\n{traceback.format_exc()}\n```", "", ""
# ββ UI layout: two-step workflow (chunk, then analyze) wired to the handlers above.
with gr.Blocks(title="iContract QA") as demo:
    gr.Markdown("# π iContract β Legal Contract QA")
    gr.Markdown("β οΈ **First analysis may take ~30β60 seconds** as the QA model (~600MB) loads for the first time.")

    # ββ Step 1 ββββββββββββββββββββββββββββββββββββββββββ
    gr.Markdown("## Step 1 β Upload & Chunk Document")
    with gr.Row():
        file_input = gr.File(
            label="Upload Contract (PDF or TXT)",
            file_types=[".pdf", ".PDF", ".txt"]
        )
        chunk_btn = gr.Button("βοΈ Chunk Document", variant="secondary")
    chunk_status = gr.Textbox(label="β³ Status", interactive=False, value="Ready β upload a contract and chunk it.")
    chunk_preview = gr.Markdown()

    # ββ Step 2 ββββββββββββββββββββββββββββββββββββββββββ
    gr.Markdown("## Step 2 β Analyze")
    with gr.Row():
        question_select = gr.Dropdown(
            choices=CLAUSES_LABELS,
            value=CLAUSES_LABELS[0],
            label="Select a clause to analyze"
        )
    with gr.Row():
        search_mode = gr.Radio(
            choices=["Top-K chunks (reranked)", "All chunks"],
            value="Top-K chunks (reranked)",
            label="Search Mode"
        )
    with gr.Row():
        # Sliders feed run_pipeline, which casts their float values to int.
        top_k_chunks = gr.Slider(1, 20, value=5, step=1, label="Top-K Chunks (reranker)")
        top_k_answers = gr.Slider(1, 10, value=3, step=1, label="Top-K Answers")
    analyze_btn = gr.Button("π Analyze", variant="primary")
    analyze_status = gr.Textbox(label="β³ Status", interactive=False, value="")

    # Results are split into two tabs: the chunks the model saw, and its answers.
    with gr.Tabs():
        with gr.Tab("π Selected Chunks"):
            chunks_out = gr.Markdown()
        with gr.Tab("β Answers"):
            answers_out = gr.Markdown()

    # Event wiring: Step 1 button runs chunk_document; Step 2 button streams
    # run_pipeline's yielded updates into the status/output components.
    chunk_btn.click(
        fn=chunk_document,
        inputs=[file_input],
        outputs=[chunk_status, chunk_preview],
    )
    analyze_btn.click(
        fn=run_pipeline,
        inputs=[question_select, search_mode, top_k_chunks, top_k_answers],
        outputs=[analyze_status, chunks_out, answers_out],
    )

# 0.0.0.0 binds on all interfaces, as required for Hugging Face Spaces / Docker.
demo.launch(server_name="0.0.0.0")