# Hugging Face Hub page artifacts (not code) — commented out so the file parses:
# Michtiii's picture
# Upload app.py
# 11339a8 verified
"""
AI Document Screening Agent β€” Gradio App for Hugging Face Spaces
Author: Kajal Dadas | kajaldadas149@gmail.com
"""
import os
import re
import shutil
import zipfile
import tempfile
import faiss
import numpy as np
import pandas as pd
import gradio as gr
from sentence_transformers import SentenceTransformer
# ── Optional parsers ───────────────────────────────────────────────────────────
# Optional third-party parsers: each file format is handled only when its
# package can be imported; otherwise extract_text() returns "" for that type.
try:
    from PyPDF2 import PdfReader
except ImportError:
    HAS_PDF = False
else:
    HAS_PDF = True

try:
    from docx import Document as DocxDocument
except ImportError:
    HAS_DOCX = False
else:
    HAS_DOCX = True

try:
    import pptx
except ImportError:
    HAS_PPTX = False
else:
    HAS_PPTX = True
# ── Screened output folder ─────────────────────────────────────────────────────
# Folder where each run's top-N documents are copied; run_screening() wipes
# and recreates it on every invocation.
SCREENED_FOLDER = "screened_documents"
os.makedirs(SCREENED_FOLDER, exist_ok=True)
# ── Model (cached) ─────────────────────────────────────────────────────────────
# Module-level cache for the sentence encoder (loaded once, on first use).
_model = None


def get_model():
    """Return the shared SentenceTransformer, loading it lazily on first call."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
# ── Text extraction ────────────────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, PPTX or TXT file.

    Unsupported extensions — or supported ones whose parser package is not
    installed (see the HAS_* flags) — yield an empty string instead of raising.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    if suffix == ".pdf" and HAS_PDF:
        pages = PdfReader(file_path).pages
        # extract_text() may return None for image-only pages.
        return " ".join(page.extract_text() or "" for page in pages)

    if suffix == ".docx" and HAS_DOCX:
        paragraphs = DocxDocument(file_path).paragraphs
        return " ".join(p.text for p in paragraphs)

    if suffix == ".pptx" and HAS_PPTX:
        fragments = []
        for slide in pptx.Presentation(file_path).slides:
            fragments.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
        return " ".join(fragments)

    return ""
# ── Keyword helpers ────────────────────────────────────────────────────────────
# Common English filler words excluded from keyword matching (all lowercase,
# 3+ letters — mirrors the regex used in extract_keywords).
STOPWORDS = {
    "with","and","the","for","are","you","will","have","this","that","from",
    "our","your","about","who","their","them","into","such","also","not",
    "but","can","all","has","its","was","were","been","more","than","when",
    "which","these","those","some","what","very","just","over","then","each",
    "much","well","need","must","use","may","any","new","per",
}


def extract_keywords(text: str) -> list:
    """Return the unique alphabetic words (3+ letters) of *text*, minus stopwords.

    Matching is done on the lowercased text. Fix: the previous version
    materialized a set comprehension into a list, so keyword order varied
    from run to run (hash order); dict.fromkeys keeps first-occurrence order
    deterministically while preserving the exact same membership.
    """
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return list(dict.fromkeys(w for w in words if w not in STOPWORDS))
# ── Scoring engine ─────────────────────────────────────────────────────────────
def score_documents(prompt: str, file_paths: list) -> pd.DataFrame:
    """Score each document against *prompt* and return a ranked DataFrame.

    Combines a FAISS L2 semantic distance (via the shared sentence encoder)
    with keyword coverage; documents with under 5% keyword overlap have
    their final score capped at 20. The hidden "_path" column carries the
    original file path for the copy step downstream.
    """
    model = get_model()
    prompt_lower = prompt.lower()
    keywords = extract_keywords(prompt_lower)

    doc_names = [os.path.basename(fp) for fp in file_paths]
    doc_paths = list(file_paths)
    doc_texts = [extract_text(fp).lower() for fp in file_paths]

    # Embed prompt + documents and rank every document by L2 distance.
    prompt_emb = model.encode([prompt_lower])
    doc_embs = model.encode(doc_texts)
    index = faiss.IndexFlatL2(doc_embs.shape[1])
    index.add(np.array(doc_embs, dtype=np.float32))
    distances, indices = index.search(
        np.array(prompt_emb, dtype=np.float32), len(doc_names)
    )

    rows = []
    for rank, idx in enumerate(indices[0]):
        text = doc_texts[idx]
        matches = sum(1 for k in keywords if k in text)
        keyword_ratio = matches / max(len(keywords), 1)
        # Map the L2 distance onto a 0-100 scale (larger distance -> lower score).
        sem_score = max(0.0, 100.0 - distances[0][rank] * 10)
        if keyword_ratio < 0.05:
            final_score = min(sem_score, 20.0)
        else:
            final_score = sem_score * keyword_ratio
        rows.append({
            "File Name": doc_names[idx],
            "_path": doc_paths[idx],
            "Keyword Matches": matches,
            "Keyword Coverage %": round(keyword_ratio * 100, 1),
            "Semantic Score": round(sem_score, 2),
            "Final Score": round(final_score, 2),
        })

    ranked = pd.DataFrame(rows).sort_values("Final Score", ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked.index += 1
    ranked.index.name = "Rank"
    return ranked
# ── ZIP builder ────────────────────────────────────────────────────────────────
def build_zip(paths: list) -> str:
    """Bundle *paths* into a ZIP archive (flat, basenames only) and return its path.

    Fix: the previous version always wrote to the same fixed file in the
    temp directory, so two concurrent screenings would clobber each other's
    download archive. mkstemp gives every call a unique path instead.
    """
    fd, zip_path = tempfile.mkstemp(prefix="screened_documents_", suffix=".zip")
    os.close(fd)  # ZipFile reopens the path itself; keep only the name.
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for fp in paths:
            zf.write(fp, arcname=os.path.basename(fp))
    return zip_path
# ── Main handler ───────────────────────────────────────────────────────────────
def run_screening(prompt, files, top_n):
    """Gradio callback: score the uploads, save the top N, build all outputs.

    Returns (display DataFrame, markdown summary, zip path); on validation
    failure or scoring error the table and zip slots are None.
    """
    # Guard clauses: both inputs are required before any work happens.
    if not prompt or not prompt.strip():
        return None, "⚠️ Enter a screening prompt first.", None
    if not files:
        return None, "⚠️ Upload at least one document.", None

    try:
        df = score_documents(prompt, [f.name for f in files])
    except Exception as e:
        return None, f"❌ Error: {e}", None

    top_n = int(top_n)
    top_df = df.head(top_n)

    # Rebuild the output folder from scratch, then copy the winners in.
    shutil.rmtree(SCREENED_FOLDER, ignore_errors=True)
    os.makedirs(SCREENED_FOLDER, exist_ok=True)
    saved = []
    for _, row in top_df.iterrows():
        dest = os.path.join(SCREENED_FOLDER, row["File Name"])
        shutil.copy2(row["_path"], dest)
        saved.append(dest)

    zip_path = build_zip(saved)
    display_df = top_df.drop(columns=["_path"]).reset_index()

    # Markdown summary: one header line plus a score bar per document.
    summary_lines = [
        f"βœ… **{len(files)} document(s) screened** Β· Top **{top_n}** saved to `screened_documents/`\n"
    ]
    for _, row in top_df.iterrows():
        filled = int(row["Final Score"] / 100 * 20)
        bar = "β–ˆ" * filled + "β–‘" * (20 - filled)
        summary_lines.append(
            f"**{row['File Name']}**\n"
            f"`{bar}` {row['Final Score']} "
            f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
        )

    return display_df, "\n\n".join(summary_lines), zip_path
# ── Gradio UI ──────────────────────────────────────────────────────────────────
# Build the Gradio UI: banner, inputs (prompt / slider / upload), results
# (scoreboard + summary), download slot, and the click wiring.
with gr.Blocks(
    title="AI Document Screening Agent",
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
    ),
    css="""
    #banner {
        background: linear-gradient(135deg, #6d28d9, #4f46e5);
        border-radius: 12px;
        padding: 20px 28px;
        color: white;
        margin-bottom: 4px;
    }
    #banner h1 { margin: 0; font-size: 1.8rem; font-weight: 800; }
    #banner p { margin: 4px 0 0; opacity: 0.8; font-size: 0.9rem; }
    footer { display: none !important; }
    """,
) as demo:
    gr.HTML("""
    <div id="banner">
        <h1>πŸ€– AI Document Screening Agent</h1>
        <p>Semantic AI + Keyword matching Β· PDF Β· DOCX Β· PPTX Β· TXT</p>
    </div>
    """)

    # Inputs: prompt + controls on the left, file upload on the right.
    with gr.Row():
        with gr.Column(scale=2):
            prompt_box = gr.Textbox(
                label="Screening Prompt",
                placeholder="Describe what you are looking for in these documents...",
                lines=5,
            )
            with gr.Row():
                top_n_slider = gr.Slider(1, 20, value=5, step=1, label="Top N to screen")
                screen_btn = gr.Button("πŸ” Run Screening", variant="primary")
        with gr.Column(scale=1):
            file_upload = gr.File(
                label="Upload Documents",
                file_types=[".pdf", ".docx", ".pptx", ".txt"],
                file_count="multiple",
                height=220,
            )

    # Results: ranked table beside the markdown summary.
    with gr.Row():
        with gr.Column(scale=3):
            result_table = gr.Dataframe(
                label="πŸ“Š Scoreboard",
                interactive=False,
                wrap=True,
            )
        with gr.Column(scale=2):
            summary_md = gr.Markdown("*Results will appear here after screening.*")

    # Download slot for the ZIP produced by run_screening().
    download_file = gr.File(
        label="⬇️ Download Screened Documents (ZIP)",
        interactive=False,
    )

    gr.Markdown(
        "---\n"
        "**Scoring:** `Final Score = Semantic Score Γ— Keyword Coverage`"
        " β€” docs with < 5% keyword overlap are capped at 20. \n"
        "*Author: Kajal Dadas Β· kajaldadas149@gmail.com*"
    )

    screen_btn.click(
        fn=run_screening,
        inputs=[prompt_box, file_upload, top_n_slider],
        outputs=[result_table, summary_md, download_file],
    )

if __name__ == "__main__":
    demo.launch()