| """ |
| AI Document Screening Agent β Gradio App for Hugging Face Spaces |
| Author: Kajal Dadas | kajaldadas149@gmail.com |
| """ |
|
|
import os
import re
import shutil
import tempfile
import zipfile

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
|
|
| |
# PDF support is optional: fall back gracefully when PyPDF2 is absent.
try:
    from PyPDF2 import PdfReader
except ImportError:
    HAS_PDF = False
else:
    HAS_PDF = True
|
|
# DOCX support is optional: fall back gracefully when python-docx is absent.
try:
    from docx import Document as DocxDocument
except ImportError:
    HAS_DOCX = False
else:
    HAS_DOCX = True
|
|
# PPTX support is optional: fall back gracefully when python-pptx is absent.
try:
    import pptx
except ImportError:
    HAS_PPTX = False
else:
    HAS_PPTX = True
|
|
| |
| SCREENED_FOLDER = "screened_documents" |
| os.makedirs(SCREENED_FOLDER, exist_ok=True) |
|
|
| |
| _model = None |
|
|
def get_model():
    """Return the shared SentenceTransformer, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
|
|
| |
def extract_text(file_path: str) -> str:
    """Best-effort plain-text extraction for PDF, DOCX, PPTX and TXT files.

    Returns "" for unsupported extensions or when the parser library for
    the file's format is not installed.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    if suffix == ".pdf":
        if not HAS_PDF:
            return ""
        pages = PdfReader(file_path).pages
        return " ".join(page.extract_text() or "" for page in pages)

    if suffix == ".docx":
        if not HAS_DOCX:
            return ""
        paragraphs = DocxDocument(file_path).paragraphs
        return " ".join(p.text for p in paragraphs)

    if suffix == ".pptx":
        if not HAS_PPTX:
            return ""
        collected = []
        for slide in pptx.Presentation(file_path).slides:
            collected.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
        return " ".join(collected)

    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()

    return ""
|
|
| |
# Common English function words excluded from keyword matching.
STOPWORDS = {
    "with","and","the","for","are","you","will","have","this","that","from",
    "our","your","about","who","their","them","into","such","also","not",
    "but","can","all","has","its","was","were","been","more","than","when",
    "which","these","those","some","what","very","just","over","then","each",
    "much","well","need","must","use","may","any","new","per",
}


def extract_keywords(text: str) -> list:
    """Return the unique alphabetic tokens (3+ letters) in *text*, minus stopwords.

    Order of the returned list is unspecified (set-derived).
    """
    tokens = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
    return list(set(tokens) - STOPWORDS)
|
|
| |
def score_documents(prompt: str, file_paths: list) -> pd.DataFrame:
    """Rank *file_paths* against *prompt* by combined semantic + keyword score.

    Returns a DataFrame sorted by "Final Score" descending, indexed 1..N
    ("Rank"), including a private "_path" column consumed by run_screening().
    """
    model = get_model()
    prompt_lower = prompt.lower()
    keywords = extract_keywords(prompt_lower)

    # Parallel lists: lower-cased extracted text, display name, original path.
    doc_texts, doc_names, doc_paths = [], [], []
    for fp in file_paths:
        doc_texts.append(extract_text(fp).lower())
        doc_names.append(os.path.basename(fp))
        doc_paths.append(fp)

    prompt_emb = model.encode([prompt_lower])
    doc_embs = model.encode(doc_texts)

    # Exact (brute-force) L2 index over every document embedding; searching
    # with k = len(doc_names) returns ALL docs ordered by distance.
    index = faiss.IndexFlatL2(doc_embs.shape[1])
    index.add(np.array(doc_embs, dtype=np.float32))
    distances, indices = index.search(np.array(prompt_emb, dtype=np.float32), len(doc_names))

    rows = []
    for rank, idx in enumerate(indices[0]):
        text = doc_texts[idx]
        # Keyword hit = prompt keyword appears anywhere as a substring.
        matches = sum(1 for k in keywords if k in text)
        keyword_ratio = matches / max(len(keywords), 1)  # guards empty prompt
        # Map L2 distance onto a 0-100 scale (closer embedding -> higher score).
        sem_score = max(0.0, 100.0 - distances[0][rank] * 10)
        # Docs with under 5% keyword coverage are capped at 20 regardless of
        # semantic similarity; otherwise semantic score is scaled by coverage.
        final_score = min(sem_score, 20.0) if keyword_ratio < 0.05 else sem_score * keyword_ratio

        rows.append({
            "File Name": doc_names[idx],
            "_path": doc_paths[idx],
            "Keyword Matches": matches,
            "Keyword Coverage %": round(keyword_ratio * 100, 1),
            "Semantic Score": round(sem_score, 2),
            "Final Score": round(final_score, 2),
        })

    df = pd.DataFrame(rows).sort_values("Final Score", ascending=False).reset_index(drop=True)
    df.index += 1
    df.index.name = "Rank"
    return df
|
|
| |
def build_zip(paths: list) -> str:
    """Bundle *paths* into screened_documents.zip in the temp dir; return its path."""
    archive = os.path.join(tempfile.gettempdir(), "screened_documents.zip")
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
        for source in paths:
            bundle.write(source, arcname=os.path.basename(source))
    return archive
|
|
| |
def run_screening(prompt, files, top_n):
    """Gradio callback: score the uploaded files against the prompt.

    Returns a (dataframe, markdown_summary, zip_path) triple. On input
    validation failure the dataframe and zip path are None and the summary
    carries a warning message.

    NOTE(review): the status strings below were mojibake in the source
    (UTF-8 rendered as ISO-8859-7); restored to the most plausible
    originals (⚠️ / ❌ / ✅ / █ / ░ / ·) — confirm against the deployed app.
    """
    if not prompt or not prompt.strip():
        return None, "⚠️ Enter a screening prompt first.", None
    if not files:
        return None, "⚠️ Upload at least one document.", None

    try:
        # Gradio file objects expose the temp path via .name
        df = score_documents(prompt, [f.name for f in files])
    except Exception as e:  # surfaced in the UI instead of crashing the app
        return None, f"❌ Error: {e}", None

    top_n = int(top_n)
    top_df = df.head(top_n)

    # Reset the output folder so it only ever holds the latest run's picks.
    shutil.rmtree(SCREENED_FOLDER, ignore_errors=True)
    os.makedirs(SCREENED_FOLDER, exist_ok=True)

    saved = []
    for _, row in top_df.iterrows():
        dest = os.path.join(SCREENED_FOLDER, row["File Name"])
        shutil.copy2(row["_path"], dest)
        saved.append(dest)

    zip_path = build_zip(saved)
    # Hide the internal "_path" column and expose the Rank index as a column.
    display_df = top_df.drop(columns=["_path"]).reset_index()

    # Markdown summary: headline plus a 20-char progress bar per document.
    lines = [
        f"✅ **{len(files)} document(s) screened** · "
        f"Top **{top_n}** saved to `screened_documents/`\n"
    ]
    for _, row in top_df.iterrows():
        filled = int(row["Final Score"] / 100 * 20)  # Final Score is 0-100
        bar = "█" * filled + "░" * (20 - filled)
        lines.append(
            f"**{row['File Name']}**\n"
            f"`{bar}` {row['Final Score']} "
            f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
        )

    return display_df, "\n\n".join(lines), zip_path
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI.
# NOTE(review): emoji/symbols in the strings below were mojibake in the
# source (UTF-8 rendered as ISO-8859-7); restored to the most plausible
# originals (🤖 / · / 🔍 / 📊 / ⬇️ / × / —) — confirm against the deployed app.
# ---------------------------------------------------------------------------
with gr.Blocks(
    title="AI Document Screening Agent",
    theme=gr.themes.Soft(
        primary_hue="purple",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
    ),
    css="""
    #banner {
        background: linear-gradient(135deg, #6d28d9, #4f46e5);
        border-radius: 12px;
        padding: 20px 28px;
        color: white;
        margin-bottom: 4px;
    }
    #banner h1 { margin: 0; font-size: 1.8rem; font-weight: 800; }
    #banner p { margin: 4px 0 0; opacity: 0.8; font-size: 0.9rem; }
    footer { display: none !important; }
    """,
) as demo:

    # Header banner.
    gr.HTML("""
    <div id="banner">
        <h1>🤖 AI Document Screening Agent</h1>
        <p>Semantic AI + Keyword matching · PDF · DOCX · PPTX · TXT</p>
    </div>
    """)

    # Input row: prompt + controls on the left, file upload on the right.
    with gr.Row():
        with gr.Column(scale=2):
            prompt_box = gr.Textbox(
                label="Screening Prompt",
                placeholder="Describe what you are looking for in these documents...",
                lines=5,
            )
            with gr.Row():
                top_n_slider = gr.Slider(1, 20, value=5, step=1, label="Top N to screen")
                screen_btn = gr.Button("🔍 Run Screening", variant="primary")

        with gr.Column(scale=1):
            file_upload = gr.File(
                label="Upload Documents",
                file_types=[".pdf", ".docx", ".pptx", ".txt"],
                file_count="multiple",
                height=220,
            )

    # Results row: ranked table plus markdown summary.
    with gr.Row():
        with gr.Column(scale=3):
            result_table = gr.Dataframe(
                label="📊 Scoreboard",
                interactive=False,
                wrap=True,
            )
        with gr.Column(scale=2):
            summary_md = gr.Markdown("*Results will appear here after screening.*")

    # ZIP of the top-N screened documents.
    download_file = gr.File(
        label="⬇️ Download Screened Documents (ZIP)",
        interactive=False,
    )

    gr.Markdown(
        "---\n"
        "**Scoring:** `Final Score = Semantic Score × Keyword Coverage`"
        " — docs with < 5% keyword overlap are capped at 20. \n"
        "*Author: Kajal Dadas · kajaldadas149@gmail.com*"
    )

    screen_btn.click(
        fn=run_screening,
        inputs=[prompt_box, file_upload, top_n_slider],
        outputs=[result_table, summary_md, download_file],
    )
|
|
| if __name__ == "__main__": |
| demo.launch() |
|
|