# Hugging Face Hub page artifacts (not code) — commented out so the file parses:
# Michtiii's picture
# Upload app.py
# 11339a8 verified
"""
AI Document Screening Agent β€” Gradio App for Hugging Face Spaces
Author: Kajal Dadas | kajaldadas149@gmail.com
"""
import os
import re
import shutil
import zipfile
import tempfile
import faiss
import numpy as np
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
# ── Optional parsers ───────────────────────────────────────────────────────────
# Optional third-party parsers: each file format is handled only when its
# package can be imported; otherwise extract_text() returns "" for that type.
try:
    from PyPDF2 import PdfReader
except ImportError:
    HAS_PDF = False
else:
    HAS_PDF = True

try:
    from docx import Document as DocxDocument
except ImportError:
    HAS_DOCX = False
else:
    HAS_DOCX = True

try:
    import pptx
except ImportError:
    HAS_PPTX = False
else:
    HAS_PPTX = True
# ── Screened output folder ─────────────────────────────────────────────────────
# Folder where each run's top-N documents are copied; run_screening() wipes
# and recreates it on every invocation.
SCREENED_FOLDER = "screened_documents"
os.makedirs(SCREENED_FOLDER, exist_ok=True)
# ── Model (cached) ─────────────────────────────────────────────────────────────
# Module-level cache for the sentence encoder (loaded once, on first use).
_model = None


def get_model():
    """Return the shared SentenceTransformer, loading it lazily on first call."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
# ── Text extraction ────────────────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, PPTX or TXT file.

    Unsupported extensions — or supported ones whose parser package is not
    installed (see the HAS_* flags) — yield an empty string instead of raising.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if suffix == ".pdf" and HAS_PDF:
        pages = PdfReader(file_path).pages
        # extract_text() may return None for image-only pages.
        return " ".join(page.extract_text() or "" for page in pages)

    if suffix == ".docx" and HAS_DOCX:
        paragraphs = DocxDocument(file_path).paragraphs
        return " ".join(p.text for p in paragraphs)

    if suffix == ".pptx" and HAS_PPTX:
        fragments = []
        for slide in pptx.Presentation(file_path).slides:
            fragments.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
        return " ".join(fragments)

    return ""
# ── Keyword helpers ────────────────────────────────────────────────────────────
# Common English filler words excluded from keyword matching (all lowercase,
# 3+ letters — mirrors the regex used in extract_keywords).
STOPWORDS = {
    "with","and","the","for","are","you","will","have","this","that","from",
    "our","your","about","who","their","them","into","such","also","not",
    "but","can","all","has","its","was","were","been","more","than","when",
    "which","these","those","some","what","very","just","over","then","each",
    "much","well","need","must","use","may","any","new","per",
}


def extract_keywords(text: str) -> list:
    """Return the unique alphabetic words (3+ letters) of *text*, minus stopwords.

    Matching is done on the lowercased text. Fix: the previous version
    materialized a set comprehension into a list, so keyword order varied
    from run to run (hash order); dict.fromkeys keeps first-occurrence order
    deterministically while preserving the exact same membership.
    """
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return list(dict.fromkeys(w for w in words if w not in STOPWORDS))
# ── Scoring engine ─────────────────────────────────────────────────────────────
def score_documents(prompt: str, file_paths: list) -> pd.DataFrame:
    """Score each document against *prompt* and return a ranked DataFrame.

    Combines a FAISS L2 semantic distance (via the shared sentence encoder)
    with keyword coverage; documents with under 5% keyword overlap have
    their final score capped at 20. The hidden "_path" column carries the
    original file path for the copy step downstream.
    """
    model = get_model()
    prompt_lower = prompt.lower()
    keywords = extract_keywords(prompt_lower)

    doc_names = [os.path.basename(fp) for fp in file_paths]
    doc_paths = list(file_paths)
    doc_texts = [extract_text(fp).lower() for fp in file_paths]

    # Embed prompt + documents and rank every document by L2 distance.
    prompt_emb = model.encode([prompt_lower])
    doc_embs = model.encode(doc_texts)
    index = faiss.IndexFlatL2(doc_embs.shape[1])
    index.add(np.array(doc_embs, dtype=np.float32))
    distances, indices = index.search(
        np.array(prompt_emb, dtype=np.float32), len(doc_names)
    )

    rows = []
    for rank, idx in enumerate(indices[0]):
        text = doc_texts[idx]
        matches = sum(1 for k in keywords if k in text)
        keyword_ratio = matches / max(len(keywords), 1)
        # Map the L2 distance onto a 0-100 scale (larger distance -> lower score).
        sem_score = max(0.0, 100.0 - distances[0][rank] * 10)
        if keyword_ratio < 0.05:
            final_score = min(sem_score, 20.0)
        else:
            final_score = sem_score * keyword_ratio
        rows.append({
            "File Name": doc_names[idx],
            "_path": doc_paths[idx],
            "Keyword Matches": matches,
            "Keyword Coverage %": round(keyword_ratio * 100, 1),
            "Semantic Score": round(sem_score, 2),
            "Final Score": round(final_score, 2),
        })

    ranked = pd.DataFrame(rows).sort_values("Final Score", ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked.index += 1
    ranked.index.name = "Rank"
    return ranked
# ── ZIP builder ────────────────────────────────────────────────────────────────
def build_zip(paths: list) -> str:
    """Bundle *paths* into a ZIP archive (flat, basenames only) and return its path.

    Fix: the previous version always wrote to the same fixed file in the
    temp directory, so two concurrent screenings would clobber each other's
    download archive. mkstemp gives every call a unique path instead.
    """
    fd, zip_path = tempfile.mkstemp(prefix="screened_documents_", suffix=".zip")
    os.close(fd)  # ZipFile reopens the path itself; keep only the name.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for fp in paths:
            zf.write(fp, arcname=os.path.basename(fp))
    return zip_path
# ── Main handler ───────────────────────────────────────────────────────────────
def run_screening(prompt, files, top_n):
    """Gradio callback: score the uploads, save the top N, build all outputs.

    Returns (display DataFrame, markdown summary, zip path); on validation
    failure or scoring error the table and zip slots are None.
    """
    # Guard clauses: both inputs are required before any work happens.
    if not prompt or not prompt.strip():
        return None, "⚠️ Enter a screening prompt first.", None
    if not files:
        return None, "⚠️ Upload at least one document.", None

    try:
        df = score_documents(prompt, [f.name for f in files])
    except Exception as e:
        return None, f"❌ Error: {e}", None

    top_n = int(top_n)
    top_df = df.head(top_n)

    # Rebuild the output folder from scratch, then copy the winners in.
    shutil.rmtree(SCREENED_FOLDER, ignore_errors=True)
    os.makedirs(SCREENED_FOLDER, exist_ok=True)
    saved = []
    for _, row in top_df.iterrows():
        dest = os.path.join(SCREENED_FOLDER, row["File Name"])
        shutil.copy2(row["_path"], dest)
        saved.append(dest)

    zip_path = build_zip(saved)
    display_df = top_df.drop(columns=["_path"]).reset_index()

    # Markdown summary: one header line plus a score bar per document.
    summary_lines = [
        f"βœ… **{len(files)} document(s) screened** Β· Top **{top_n}** saved to `screened_documents/`\n"
    ]
    for _, row in top_df.iterrows():
        filled = int(row["Final Score"] / 100 * 20)
        bar = "β–ˆ" * filled + "β–‘" * (20 - filled)
        summary_lines.append(
            f"**{row['File Name']}**\n"
            f"`{bar}` {row['Final Score']} "
            f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
        )

    return display_df, "\n\n".join(summary_lines), zip_path
# ── Gradio UI ──────────────────────────────────────────────────────────────────
# Build the Gradio UI: banner, inputs (prompt / slider / upload), results
# (scoreboard + summary), download slot, and the click wiring.
with gr.Blocks(
    title="AI Document Screening Agent",
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
    ),
    css="""
    #banner {
        background: linear-gradient(135deg, #6d28d9, #4f46e5);
        border-radius: 12px;
        padding: 20px 28px;
        color: white;
        margin-bottom: 4px;
    }
    #banner h1 { margin: 0; font-size: 1.8rem; font-weight: 800; }
    #banner p { margin: 4px 0 0; opacity: 0.8; font-size: 0.9rem; }
    footer { display: none !important; }
    """,
) as demo:
    gr.HTML("""
    <div id="banner">
        <h1>πŸ€– AI Document Screening Agent</h1>
        <p>Semantic AI + Keyword matching Β· PDF Β· DOCX Β· PPTX Β· TXT</p>
    </div>
    """)

    # Inputs: prompt + controls on the left, file upload on the right.
    with gr.Row():
        with gr.Column(scale=2):
            prompt_box = gr.Textbox(
                label="Screening Prompt",
                placeholder="Describe what you are looking for in these documents...",
                lines=5,
            )
            with gr.Row():
                top_n_slider = gr.Slider(1, 20, value=5, step=1, label="Top N to screen")
                screen_btn = gr.Button("πŸ” Run Screening", variant="primary")
        with gr.Column(scale=1):
            file_upload = gr.File(
                label="Upload Documents",
                file_types=[".pdf", ".docx", ".pptx", ".txt"],
                file_count="multiple",
                height=220,
            )

    # Results: ranked table beside the markdown summary.
    with gr.Row():
        with gr.Column(scale=3):
            result_table = gr.Dataframe(
                label="πŸ“Š Scoreboard",
                interactive=False,
                wrap=True,
            )
        with gr.Column(scale=2):
            summary_md = gr.Markdown("*Results will appear here after screening.*")

    # Download slot for the ZIP produced by run_screening().
    download_file = gr.File(
        label="⬇️ Download Screened Documents (ZIP)",
        interactive=False,
    )

    gr.Markdown(
        "---\n"
        "**Scoring:** `Final Score = Semantic Score Γ— Keyword Coverage`"
        " β€” docs with < 5% keyword overlap are capped at 20. \n"
        "*Author: Kajal Dadas Β· kajaldadas149@gmail.com*"
    )

    screen_btn.click(
        fn=run_screening,
        inputs=[prompt_box, file_upload, top_n_slider],
        outputs=[result_table, summary_md, download_file],
    )

if __name__ == "__main__":
    demo.launch()