Spaces:
Running
Running
import json
import gradio as gr
from chunker import extract_text_from_file, chunk_text
from reranker import CrossEncoderRanker
from inference import answer_topk_longformer
import os

# Reranker checkpoint is overridable via the RERANKER_MODEL env var;
# defaults to a small MS MARCO cross-encoder.
DEFAULT_RANKER = os.environ.get("RERANKER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")

# Load the canned clause questions shipped next to the app.
# Each entry maps a short display label ("question_id_text") to the full
# natural-language question fed to the QA model.
with open("questions_clauses.json", "r", encoding="utf-8") as f:
    clauses_data = json.load(f)
CLAUSES_MAP = {item["question_id_text"]: item["question"] for item in clauses_data}
CLAUSES_LABELS = list(CLAUSES_MAP.keys())

# Instantiated once at import time so every request reuses the loaded model.
ranker = CrossEncoderRanker(model_name=DEFAULT_RANKER)

# Global state to store chunks between steps (Step 1 chunking -> Step 2 analysis).
# NOTE(review): module-global state is shared across all concurrent users of the
# Space — fine for a single-user demo, a per-session gr.State otherwise.
_chunks_cache = []
def chunk_document(file, progress=gr.Progress(track_tqdm=True)):
    """Step 1: extract text from the uploaded contract and split it into chunks.

    Stores the chunks in the module-level ``_chunks_cache`` for the analysis
    step and returns ``(status_message, preview_markdown)`` for the UI.

    Args:
        file: Gradio file object for the uploaded PDF/TXT (or None).
        progress: Gradio progress tracker injected by the framework.
    """
    global _chunks_cache
    if file is None:
        return "β οΈ Please upload a contract file.", ""
    try:
        progress(0.2, desc="π Extracting text...")
        text = extract_text_from_file(file.name)
        # chunker signals extraction failures in-band with an "ERROR:" prefix.
        if text.startswith("ERROR:"):
            return f"β {text}", ""
        progress(0.6, desc="βοΈ Chunking document...")
        _chunks_cache = chunk_text(text)
        total_chunks = len(_chunks_cache)
        progress(1.0, desc="β Done!")
        # Fix: the summary promised "Showing first 5 below" while the preview
        # code was commented out, so the message pointed at nothing. The
        # preview of the first 5 chunks is restored here.
        preview = "\n\n---\n\n".join(
            f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(_chunks_cache[:5])
        )
        summary = f"β Document chunked into **{total_chunks} chunks**. Showing first 5 below.\n\n{preview}"
        return f"β Ready β {total_chunks} chunks created. Now select a question and click Analyze.", summary
    except Exception:
        # Surface the full traceback in the status box so Space users can
        # report actionable errors; the exception object itself is unused.
        import traceback
        return f"β Error:\n```\n{traceback.format_exc()}\n```", ""
def run_pipeline(question_label, search_mode, top_k_chunks, top_k_answers, progress=gr.Progress(track_tqdm=True)):
    """Step 2: answer the selected clause question over the cached chunks.

    Generator that yields ``(status, chunks_markdown, answers_markdown)``
    tuples so the UI updates progressively. Requires ``chunk_document`` to
    have populated ``_chunks_cache`` first.

    Args:
        question_label: key into CLAUSES_MAP chosen in the dropdown.
        search_mode: "Top-K chunks (reranked)" or "All chunks".
        top_k_chunks: number of chunks the reranker keeps.
        top_k_answers: number of answers the QA model returns.
        progress: Gradio progress tracker injected by the framework.
    """
    global _chunks_cache
    if not _chunks_cache:
        yield "β οΈ Please upload and chunk a document first.", "", ""
        return
    try:
        # Fix: the map lookup used to happen before the try-block, so an
        # unknown label crashed with a raw KeyError instead of being shown
        # through the formatted error path below.
        question = CLAUSES_MAP[question_label]
        total_chunks = len(_chunks_cache)
        if search_mode == "Top-K chunks (reranked)":
            yield f"π Reranking {total_chunks} chunks...", "", ""
            progress(0.3, desc="π Reranking chunks...")
            ranked = ranker.rank(question, _chunks_cache, top_k=int(top_k_chunks))
            selected_chunks = [chunk for chunk, score in ranked]
        else:
            yield f"π Using all {total_chunks} chunks...", "", ""
            progress(0.3, desc="π Using all chunks...")
            selected_chunks = _chunks_cache
        chunks_display = f"π **Total chunks:** {total_chunks} | **Using:** {len(selected_chunks)}\n\n"
        chunks_display += "\n\n---\n\n".join(
            [f"**Chunk {i+1}:**\n{c}" for i, c in enumerate(selected_chunks)]
        )
        # Show the selected chunks immediately; the QA model call below is slow.
        yield "π€ Model is analyzing the document... (this may take 30β60s on first run)", chunks_display, ""
        progress(0.65, desc="π€ Running QA model...")
        answers = answer_topk_longformer(question, selected_chunks, top_k=int(top_k_answers))
        progress(0.95, desc="β Formatting results...")
        if not answers:
            yield "β οΈ No answers found. Try different chunks or question.", chunks_display, ""
            return
        answers_display = f"### β Question\n> {question}\n\n---\n\n### π Top Answers\n\n"
        for i, (ans, score) in enumerate(answers, 1):
            answers_display += f"**Answer {i}** β Score: `{score:.4f}`\n\n{ans}\n\n{'β'*50}\n\n"
        progress(1.0, desc="Done!")
        yield "β Done!", chunks_display, answers_display
    except Exception:
        # Surface the full traceback in the status box; exception object unused.
        import traceback
        yield f"β Error:\n```\n{traceback.format_exc()}\n```", "", ""
# ββ UI layout: two-step workflow (chunk, then analyze) wired to the handlers above.
with gr.Blocks(title="iContract QA") as demo:
    gr.Markdown("# π iContract β Legal Contract QA")
    gr.Markdown("β οΈ **First analysis may take ~30β60 seconds** as the QA model (~600MB) loads for the first time.")

    # ββ Step 1 ββββββββββββββββββββββββββββββββββββββββββ
    gr.Markdown("## Step 1 β Upload & Chunk Document")
    with gr.Row():
        file_input = gr.File(
            label="Upload Contract (PDF or TXT)",
            file_types=[".pdf", ".PDF", ".txt"]
        )
        chunk_btn = gr.Button("βοΈ Chunk Document", variant="secondary")
    chunk_status = gr.Textbox(label="β³ Status", interactive=False, value="Ready β upload a contract and chunk it.")
    chunk_preview = gr.Markdown()

    # ββ Step 2 ββββββββββββββββββββββββββββββββββββββββββ
    gr.Markdown("## Step 2 β Analyze")
    with gr.Row():
        question_select = gr.Dropdown(
            choices=CLAUSES_LABELS,
            value=CLAUSES_LABELS[0],
            label="Select a clause to analyze"
        )
    with gr.Row():
        search_mode = gr.Radio(
            choices=["Top-K chunks (reranked)", "All chunks"],
            value="Top-K chunks (reranked)",
            label="Search Mode"
        )
    with gr.Row():
        # Sliders feed run_pipeline, which casts their float values to int.
        top_k_chunks = gr.Slider(1, 20, value=5, step=1, label="Top-K Chunks (reranker)")
        top_k_answers = gr.Slider(1, 10, value=3, step=1, label="Top-K Answers")
    analyze_btn = gr.Button("π Analyze", variant="primary")
    analyze_status = gr.Textbox(label="β³ Status", interactive=False, value="")

    # Results are split into two tabs: the chunks the model saw, and its answers.
    with gr.Tabs():
        with gr.Tab("π Selected Chunks"):
            chunks_out = gr.Markdown()
        with gr.Tab("β Answers"):
            answers_out = gr.Markdown()

    # Event wiring: Step 1 button runs chunk_document; Step 2 button streams
    # run_pipeline's yielded updates into the status/output components.
    chunk_btn.click(
        fn=chunk_document,
        inputs=[file_input],
        outputs=[chunk_status, chunk_preview],
    )
    analyze_btn.click(
        fn=run_pipeline,
        inputs=[question_select, search_mode, top_k_chunks, top_k_answers],
        outputs=[analyze_status, chunks_out, answers_out],
    )

# 0.0.0.0 binds on all interfaces, as required for Hugging Face Spaces / Docker.
demo.launch(server_name="0.0.0.0")