# app.py — iContract legal-contract QA (Hugging Face Space "ai", commit 0d179fc by jira877832)
import json
import gradio as gr
from chunker import extract_text_from_file, chunk_text
from reranker import CrossEncoderRanker
from inference import answer_topk_longformer
import os
# Cross-encoder checkpoint used for chunk reranking; overridable via env var.
DEFAULT_RANKER = os.environ.get("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")

# Load the clause-question catalogue shipped alongside the app.
with open("questions_clauses.json", "r", encoding="utf-8") as f:
    clauses_data = json.load(f)

# Dropdown label -> full question text, plus the ordered label list for the UI.
CLAUSES_MAP = {item["question_id_text"]: item["question"] for item in clauses_data}
CLAUSES_LABELS = list(CLAUSES_MAP.keys())

# Reranker instantiated once at import time and shared across requests.
ranker = CrossEncoderRanker(model_name=DEFAULT_RANKER)

# Global state to store chunks between steps (filled by chunk_document,
# read by run_pipeline).
_chunks_cache = []
def chunk_document(file, progress=gr.Progress(track_tqdm=True)):
    """Extract text from the uploaded contract and split it into chunks.

    Stores the chunks in the module-level ``_chunks_cache`` so the later
    analysis step (``run_pipeline``) can reuse them.

    Args:
        file: Gradio file object (or None if nothing was uploaded).
        progress: Gradio progress tracker for UI feedback.

    Returns:
        Tuple of (status_message, preview_markdown) strings for the UI.
    """
    global _chunks_cache
    if file is None:
        return "⚠️ Please upload a contract file.", ""
    try:
        progress(0.2, desc="πŸ“„ Extracting text...")
        text = extract_text_from_file(file.name)
        # extract_text_from_file signals failure via an "ERROR:" prefix
        # rather than raising, so check for it explicitly.
        if text.startswith("ERROR:"):
            return f"❌ {text}", ""
        progress(0.6, desc="βœ‚οΈ Chunking document...")
        _chunks_cache = chunk_text(text)
        total_chunks = len(_chunks_cache)
        progress(1.0, desc="βœ… Done!")
        # Show the first few chunks so the user can sanity-check the split;
        # the summary message below promises exactly this preview.
        preview = "\n\n---\n\n".join(
            f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(_chunks_cache[:5])
        )
        summary = f"βœ… Document chunked into **{total_chunks} chunks**. Showing first 5 below.\n\n{preview}"
        return f"βœ… Ready β€” {total_chunks} chunks created. Now select a question and click Analyze.", summary
    except Exception:
        # Surface the full traceback in the UI instead of failing silently.
        import traceback
        return f"❌ Error:\n```\n{traceback.format_exc()}\n```", ""
def run_pipeline(question_label, search_mode, top_k_chunks, top_k_answers, progress=gr.Progress(track_tqdm=True)):
    """Answer the selected clause question over the previously chunked document.

    Generator that yields (status, chunks_markdown, answers_markdown) tuples
    so Gradio can stream intermediate progress to the UI.

    Args:
        question_label: Dropdown label; resolved to a question via CLAUSES_MAP.
        search_mode: "Top-K chunks (reranked)" or "All chunks".
        top_k_chunks: How many reranked chunks to keep (reranked mode only).
        top_k_answers: How many answers to request from the QA model.
        progress: Gradio progress tracker for UI feedback.
    """
    global _chunks_cache
    if not _chunks_cache:
        yield "⚠️ Please upload and chunk a document first.", "", ""
        return
    # Guard the lookup: an unknown/stale dropdown label would otherwise raise
    # an unhandled KeyError before the try block below.
    question = CLAUSES_MAP.get(question_label)
    if question is None:
        yield f"⚠️ Unknown question selected: {question_label}", "", ""
        return
    try:
        total_chunks = len(_chunks_cache)
        if search_mode == "Top-K chunks (reranked)":
            yield f"πŸ” Reranking {total_chunks} chunks...", "", ""
            progress(0.3, desc="πŸ” Reranking chunks...")
            # Cross-encoder scores every (question, chunk) pair; keep the best K.
            ranked = ranker.rank(question, _chunks_cache, top_k=int(top_k_chunks))
            selected_chunks = [chunk for chunk, score in ranked]
        else:
            yield f"πŸ“ƒ Using all {total_chunks} chunks...", "", ""
            progress(0.3, desc="πŸ“ƒ Using all chunks...")
            selected_chunks = _chunks_cache
        chunks_display = f"πŸ“„ **Total chunks:** {total_chunks} | **Using:** {len(selected_chunks)}\n\n"
        chunks_display += "\n\n---\n\n".join(
            f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(selected_chunks)
        )
        yield "πŸ€– Model is analyzing the document... (this may take 30–60s on first run)", chunks_display, ""
        progress(0.65, desc="πŸ€– Running QA model...")
        answers = answer_topk_longformer(question, selected_chunks, top_k=int(top_k_answers))
        progress(0.95, desc="βœ… Formatting results...")
        if not answers:
            yield "⚠️ No answers found. Try different chunks or question.", chunks_display, ""
            return
        answers_display = f"### ❓ Question\n> {question}\n\n---\n\n### πŸ“‹ Top Answers\n\n"
        for i, (ans, score) in enumerate(answers, 1):
            answers_display += f"**Answer {i}** β€” Score: `{score:.4f}`\n\n{ans}\n\n{'β€”'*50}\n\n"
        progress(1.0, desc="Done!")
        yield "βœ… Done!", chunks_display, answers_display
    except Exception:
        # Surface the full traceback in the UI instead of failing silently.
        import traceback
        yield f"❌ Error:\n```\n{traceback.format_exc()}\n```", "", ""
# UI layout: a two-step workflow — (1) upload & chunk, (2) select a clause
# question and analyze. Component construction order defines the page layout.
with gr.Blocks(title="iContract QA") as demo:
    gr.Markdown("# πŸ“‘ iContract β€” Legal Contract QA")
    gr.Markdown("⚠️ **First analysis may take ~30–60 seconds** as the QA model (~600MB) loads for the first time.")
    # ── Step 1 ──────────────────────────────────────────
    gr.Markdown("## Step 1 β€” Upload & Chunk Document")
    with gr.Row():
        file_input = gr.File(
            label="Upload Contract (PDF or TXT)",
            file_types=[".pdf", ".PDF", ".txt"]
        )
        chunk_btn = gr.Button("βœ‚οΈ Chunk Document", variant="secondary")
    chunk_status = gr.Textbox(label="⏳ Status", interactive=False, value="Ready β€” upload a contract and chunk it.")
    chunk_preview = gr.Markdown()
    # ── Step 2 ──────────────────────────────────────────
    gr.Markdown("## Step 2 β€” Analyze")
    with gr.Row():
        # Labels come from questions_clauses.json loaded at module import.
        question_select = gr.Dropdown(
            choices=CLAUSES_LABELS,
            value=CLAUSES_LABELS[0],
            label="Select a clause to analyze"
        )
    with gr.Row():
        search_mode = gr.Radio(
            choices=["Top-K chunks (reranked)", "All chunks"],
            value="Top-K chunks (reranked)",
            label="Search Mode"
        )
    with gr.Row():
        # top_k_chunks only applies in reranked mode; top_k_answers always.
        top_k_chunks = gr.Slider(1, 20, value=5, step=1, label="Top-K Chunks (reranker)")
        top_k_answers = gr.Slider(1, 10, value=3, step=1, label="Top-K Answers")
    analyze_btn = gr.Button("πŸ” Analyze", variant="primary")
    analyze_status = gr.Textbox(label="⏳ Status", interactive=False, value="")
    with gr.Tabs():
        with gr.Tab("πŸ“„ Selected Chunks"):
            chunks_out = gr.Markdown()
        with gr.Tab("βœ… Answers"):
            answers_out = gr.Markdown()
    # Step 1 wiring: chunk the uploaded file and show status + preview.
    chunk_btn.click(
        fn=chunk_document,
        inputs=[file_input],
        outputs=[chunk_status, chunk_preview],
    )
    # Step 2 wiring: run_pipeline is a generator, so Gradio streams its
    # yielded (status, chunks, answers) tuples into the three outputs.
    analyze_btn.click(
        fn=run_pipeline,
        inputs=[question_select, search_mode, top_k_chunks, top_k_answers],
        outputs=[analyze_status, chunks_out, answers_out],
    )
# Bind to all interfaces so the app is reachable inside the Space container.
demo.launch(server_name="0.0.0.0")