Spaces:

Michtiii
/

AI_Dcouments_Screening_Agent

Sleeping

App Files Files Community

Michtiii committed on Mar 26

Commit

11339a8

verified ·

1 Parent(s): 3f3bb41

Upload app.py

Browse files

Files changed (1) hide show

app.py +114 -128

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 """
 AI Document Screening Agent — Gradio App for Hugging Face Spaces
 Author: Kajal Dadas | kajaldadas149@gmail.com
-Enhanced for HF Spaces deployment with Gradio UI
 """
 import os
 import re
 import shutil
 import tempfile
 import faiss
@@ -15,7 +15,7 @@ import pandas as pd
 import gradio as gr
 from sentence_transformers import SentenceTransformer
-# ── Optional parsers (graceful fallback if not installed) ──────────────────────
 try:
     from PyPDF2 import PdfReader
     HAS_PDF = True
@@ -34,14 +34,17 @@ try:
 except ImportError:
     HAS_PPTX = False
-# ── Model (cached globally for speed) ─────────────────────────────────────────
-MODEL_NAME = "all-MiniLM-L6-v2"
 _model = None
 def get_model():
     global _model
     if _model is None:
-        _model = SentenceTransformer(MODEL_NAME)
     return _model
 # ── Text extraction ────────────────────────────────────────────────────────────
@@ -50,19 +53,19 @@ def extract_text(file_path: str) -> str:
     if ext == ".pdf":
         if not HAS_PDF:
-            return "[PDF support unavailable — install PyPDF2]"
         reader = PdfReader(file_path)
         return " ".join(page.extract_text() or "" for page in reader.pages)
     if ext == ".docx":
         if not HAS_DOCX:
-            return "[DOCX support unavailable — install python-docx]"
         doc = DocxDocument(file_path)
         return " ".join(p.text for p in doc.paragraphs)
     if ext == ".pptx":
         if not HAS_PPTX:
-            return "[PPTX support unavailable — install python-pptx]"
         prs = pptx.Presentation(file_path)
         texts = []
         for slide in prs.slides:
@@ -75,206 +78,189 @@ def extract_text(file_path: str) -> str:
         with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
             return f.read()
-    return f"[Unsupported file type: {ext}]"
 # ── Keyword helpers ────────────────────────────────────────────────────────────
 STOPWORDS = {
     "with","and","the","for","are","you","will","have","this","that","from",
     "our","your","about","who","their","them","into","such","also","not",
     "but","can","all","has","its","was","were","been","more","than","when",
-    "which","these","those","some","what","very","just","over","then","than",
-    "each","much","well","also","need","must","use","may","any","new","per",
 }
-def extract_keywords(text: str) -> list[str]:
     words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
     return list({w for w in words if w not in STOPWORDS})
 # ── Scoring engine ─────────────────────────────────────────────────────────────
-def score_documents(prompt: str, file_paths: list[str]) -> pd.DataFrame:
-    if not prompt.strip():
-        raise gr.Error("Please enter a job description / screening prompt.")
-    if not file_paths:
-        raise gr.Error("Please upload at least one document.")
-    model = get_model()
-    jd_lower = prompt.lower()
-    jd_keywords = extract_keywords(jd_lower)
-    doc_texts, doc_names = [], []
     for fp in file_paths:
-        name = os.path.basename(fp)
-        text = extract_text(fp).lower()
-        doc_texts.append(text)
-        doc_names.append(name)
-    # Semantic embeddings
-    jd_emb = model.encode([jd_lower])
-    doc_embs = model.encode(doc_texts)
-    dim = doc_embs.shape[1]
-    index = faiss.IndexFlatL2(dim)
     index.add(np.array(doc_embs, dtype=np.float32))
-    distances, indices = index.search(np.array(jd_emb, dtype=np.float32), len(doc_names))
     rows = []
     for rank, idx in enumerate(indices[0]):
-        text = doc_texts[idx]
-        matches = sum(1 for k in jd_keywords if k in text)
-        keyword_ratio = matches / max(len(jd_keywords), 1)
-        sem_score = max(0.0, 100.0 - distances[0][rank] * 10)
-        # Strict scoring: penalise near-zero keyword overlap
-        if keyword_ratio < 0.05:
-            final_score = min(sem_score, 20.0)
-        else:
-            final_score = sem_score * keyword_ratio
         rows.append({
-            "File Name":         doc_names[idx],
-            "Keyword Matches":   matches,
             "Keyword Coverage %": round(keyword_ratio * 100, 1),
-            "Semantic Score":    round(sem_score, 2),
-            "Final Score":       round(final_score, 2),
         })
-    df = pd.DataFrame(rows).sort_values("Final Score", ascending=False).reset_index(drop=True)
-    df.index += 1
     df.index.name = "Rank"
     return df
-# ── Gradio interface ───────────────────────────────────────────────────────────
-DESCRIPTION = """
-## 🤖 AI Document Screening Agent
-Upload **any documents** (PDF, DOCX, PPTX, TXT) and describe what you're looking for.
-The agent combines **semantic AI matching** with **strict keyword coverage** to rank candidates.
-> *Built with Sentence-Transformers + FAISS · Supports PDF, DOCX, PPTX, TXT*
-"""
 def run_screening(prompt, files, top_n):
-    if files is None or len(files) == 0:
-        return None, "⚠️ No files uploaded."
     try:
         df = score_documents(prompt, [f.name for f in files])
-    except gr.Error as e:
-        return None, str(e)
     except Exception as e:
-        return None, f"❌ Error: {e}"
-    top_df = df.head(int(top_n))
-    summary_lines = [f"✅ Screened **{len(files)} document(s)** · Showing top **{int(top_n)}** results\n"]
     for _, row in top_df.iterrows():
-        bar_filled = int(row["Final Score"] / 100 * 20)
-        bar = "█" * bar_filled + "░" * (20 - bar_filled)
-        summary_lines.append(
             f"**{row['File Name']}**\n"
-            f"`{bar}` {row['Final Score']}%  "
-            f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
         )
-    return top_df.reset_index(), "\n\n".join(summary_lines)
 with gr.Blocks(
     title="AI Document Screening Agent",
     theme=gr.themes.Soft(
-        primary_hue="violet",
-        secondary_hue="purple",
         neutral_hue="slate",
         font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
     ),
     css="""
-        #title-banner {
-            background: linear-gradient(135deg, #6d28d9 0%, #7c3aed 50%, #4f46e5 100%);
-            border-radius: 14px;
-            padding: 24px 32px;
-            margin-bottom: 8px;
             color: white;
         }
-        #title-banner h1 { margin: 0; font-size: 2rem; font-weight: 800; }
-        #title-banner p  { margin: 6px 0 0; opacity: 0.85; font-size: 0.95rem; }
-        .gr-button-primary { background: #7c3aed !important; }
         footer { display: none !important; }
     """,
 ) as demo:
     gr.HTML("""
-        <div id="title-banner">
             <h1>🤖 AI Document Screening Agent</h1>
             <p>Semantic AI + Keyword matching · PDF · DOCX · PPTX · TXT</p>
         </div>
     """)
     with gr.Row():
         with gr.Column(scale=2):
             prompt_box = gr.Textbox(
-                label="📋 Job Description / Screening Prompt",
-                placeholder=(
-                    "e.g. Looking for a senior Python developer with experience in "
-                    "machine learning, FastAPI, Docker, and AWS. Strong communication skills required."
-                ),
-                lines=6,
-                show_copy_button=True,
             )
             with gr.Row():
-                top_n_slider = gr.Slider(
-                    minimum=1, maximum=20, value=5, step=1,
-                    label="Top N results to highlight",
-                )
-                screen_btn = gr.Button("🔍 Screen Documents", variant="primary", scale=1)
         with gr.Column(scale=1):
             file_upload = gr.File(
-                label="📁 Upload Documents",
                 file_types=[".pdf", ".docx", ".pptx", ".txt"],
                 file_count="multiple",
-                height=200,
             )
     with gr.Row():
-        with gr.Column():
             result_table = gr.Dataframe(
-                label="📊 Screening Scoreboard",
-                headers=["Rank", "File Name", "Keyword Matches", "Keyword Coverage %", "Semantic Score", "Final Score"],
                 interactive=False,
                 wrap=True,
             )
-        with gr.Column():
-            summary_box = gr.Markdown(label="📝 Summary", value="*Results will appear here after screening.*")
-    screen_btn.click(
-        fn=run_screening,
-        inputs=[prompt_box, file_upload, top_n_slider],
-        outputs=[result_table, summary_box],
-        api_name="screen",
     )
-    gr.Examples(
-        examples=[
-            [
-                "Looking for a data scientist with Python, machine learning, TensorFlow, SQL, and data visualisation skills. PhD preferred.",
-                None, 5
-            ],
-            [
-                "Hiring a frontend engineer with React, TypeScript, CSS, and experience in responsive design and accessibility.",
-                None, 3
-            ],
-        ],
-        inputs=[prompt_box, file_upload, top_n_slider],
-        label="💡 Example Prompts",
     )
-    gr.Markdown(
-        """
-        ---
-        **How scoring works:**
-        `Final Score = Semantic Score × Keyword Coverage`  — documents with < 5 % keyword overlap are capped at 20.
-        Built with 🤗 `sentence-transformers/all-MiniLM-L6-v2` + FAISS.
-        *Author: Kajal Dadas · kajaldadas149@gmail.com*
-        """,
-        elem_id="footer-note",
     )
 if __name__ == "__main__":

 """
 AI Document Screening Agent — Gradio App for Hugging Face Spaces
 Author: Kajal Dadas | kajaldadas149@gmail.com
 """
 import os
 import re
 import shutil
+import zipfile
 import tempfile
 import faiss
 import gradio as gr
 from sentence_transformers import SentenceTransformer
+# ── Optional parsers ───────────────────────────────────────────────────────────
 try:
     from PyPDF2 import PdfReader
     HAS_PDF = True
 except ImportError:
     HAS_PPTX = False
+# ── Screened output folder ─────────────────────────────────────────────────────
+SCREENED_FOLDER = "screened_documents"
+os.makedirs(SCREENED_FOLDER, exist_ok=True)
+# ── Model (cached) ─────────────────────────────────────────────────────────────
 _model = None
 def get_model():
     global _model
     if _model is None:
+        _model = SentenceTransformer("all-MiniLM-L6-v2")
     return _model
 # ── Text extraction ────────────────────────────────────────────────────────────
     if ext == ".pdf":
         if not HAS_PDF:
+            return ""
         reader = PdfReader(file_path)
         return " ".join(page.extract_text() or "" for page in reader.pages)
     if ext == ".docx":
         if not HAS_DOCX:
+            return ""
         doc = DocxDocument(file_path)
         return " ".join(p.text for p in doc.paragraphs)
     if ext == ".pptx":
         if not HAS_PPTX:
+            return ""
         prs = pptx.Presentation(file_path)
         texts = []
         for slide in prs.slides:
         with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
             return f.read()
+    return ""
 # ── Keyword helpers ────────────────────────────────────────────────────────────
 STOPWORDS = {
     "with","and","the","for","are","you","will","have","this","that","from",
     "our","your","about","who","their","them","into","such","also","not",
     "but","can","all","has","its","was","were","been","more","than","when",
+    "which","these","those","some","what","very","just","over","then","each",
+    "much","well","need","must","use","may","any","new","per",
 }
+def extract_keywords(text: str) -> list:
     words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
     return list({w for w in words if w not in STOPWORDS})
 # ── Scoring engine ─────────────────────────────────────────────────────────────
+def score_documents(prompt: str, file_paths: list) -> pd.DataFrame:
+    model        = get_model()
+    prompt_lower = prompt.lower()
+    keywords     = extract_keywords(prompt_lower)
+    doc_texts, doc_names, doc_paths = [], [], []
     for fp in file_paths:
+        doc_texts.append(extract_text(fp).lower())
+        doc_names.append(os.path.basename(fp))
+        doc_paths.append(fp)
+    prompt_emb = model.encode([prompt_lower])
+    doc_embs   = model.encode(doc_texts)
+    index = faiss.IndexFlatL2(doc_embs.shape[1])
     index.add(np.array(doc_embs, dtype=np.float32))
+    distances, indices = index.search(np.array(prompt_emb, dtype=np.float32), len(doc_names))
     rows = []
     for rank, idx in enumerate(indices[0]):
+        text          = doc_texts[idx]
+        matches       = sum(1 for k in keywords if k in text)
+        keyword_ratio = matches / max(len(keywords), 1)
+        sem_score     = max(0.0, 100.0 - distances[0][rank] * 10)
+        final_score   = min(sem_score, 20.0) if keyword_ratio < 0.05 else sem_score * keyword_ratio
         rows.append({
+            "File Name":          doc_names[idx],
+            "_path":              doc_paths[idx],
+            "Keyword Matches":    matches,
             "Keyword Coverage %": round(keyword_ratio * 100, 1),
+            "Semantic Score":     round(sem_score, 2),
+            "Final Score":        round(final_score, 2),
         })
+    df            = pd.DataFrame(rows).sort_values("Final Score", ascending=False).reset_index(drop=True)
+    df.index     += 1
     df.index.name = "Rank"
     return df
+# ── ZIP builder ────────────────────────────────────────────────────────────────
+def build_zip(paths: list) -> str:
+    zip_path = os.path.join(tempfile.gettempdir(), "screened_documents.zip")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+        for fp in paths:
+            zf.write(fp, arcname=os.path.basename(fp))
+    return zip_path
+# ── Main handler ───────────────────────────────────────────────────────────────
 def run_screening(prompt, files, top_n):
+    if not prompt or not prompt.strip():
+        return None, "⚠️ Enter a screening prompt first.", None
+    if not files:
+        return None, "⚠️ Upload at least one document.", None
     try:
         df = score_documents(prompt, [f.name for f in files])
     except Exception as e:
+        return None, f"❌ Error: {e}", None
+    top_n  = int(top_n)
+    top_df = df.head(top_n)
+    # ── Save top docs to screened_documents/ ──────────────────────────────────
+    shutil.rmtree(SCREENED_FOLDER, ignore_errors=True)
+    os.makedirs(SCREENED_FOLDER, exist_ok=True)
+    saved = []
+    for _, row in top_df.iterrows():
+        dest = os.path.join(SCREENED_FOLDER, row["File Name"])
+        shutil.copy2(row["_path"], dest)
+        saved.append(dest)
+    zip_path   = build_zip(saved)
+    display_df = top_df.drop(columns=["_path"]).reset_index()
+    # ── Summary text ──────────────────────────────────────────────────────────
+    lines = [f"✅ **{len(files)} document(s) screened** · Top **{top_n}** saved to `screened_documents/`\n"]
     for _, row in top_df.iterrows():
+        filled = int(row["Final Score"] / 100 * 20)
+        bar    = "█" * filled + "░" * (20 - filled)
+        lines.append(
             f"**{row['File Name']}**\n"
+            f"`{bar}` {row['Final Score']}  "
+            f"| Keywords: {row['Keyword Matches']}  | Semantic: {row['Semantic Score']}"
         )
+    return display_df, "\n\n".join(lines), zip_path
+# ── Gradio UI ──────────────────────────────────────────────────────────────────
 with gr.Blocks(
     title="AI Document Screening Agent",
     theme=gr.themes.Soft(
+        primary_hue="purple",
+        secondary_hue="indigo",
         neutral_hue="slate",
         font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
     ),
     css="""
+        #banner {
+            background: linear-gradient(135deg, #6d28d9, #4f46e5);
+            border-radius: 12px;
+            padding: 20px 28px;
             color: white;
+            margin-bottom: 4px;
         }
+        #banner h1 { margin: 0; font-size: 1.8rem; font-weight: 800; }
+        #banner p  { margin: 4px 0 0; opacity: 0.8; font-size: 0.9rem; }
         footer { display: none !important; }
     """,
 ) as demo:
     gr.HTML("""
+        <div id="banner">
             <h1>🤖 AI Document Screening Agent</h1>
             <p>Semantic AI + Keyword matching · PDF · DOCX · PPTX · TXT</p>
         </div>
     """)
+    # ── Inputs ─────────────────────────────────────────────────────────────────
     with gr.Row():
         with gr.Column(scale=2):
             prompt_box = gr.Textbox(
+                label="Screening Prompt",
+                placeholder="Describe what you are looking for in these documents...",
+                lines=5,
             )
             with gr.Row():
+                top_n_slider = gr.Slider(1, 20, value=5, step=1, label="Top N to screen")
+                screen_btn   = gr.Button("🔍  Run Screening", variant="primary")
         with gr.Column(scale=1):
             file_upload = gr.File(
+                label="Upload Documents",
                 file_types=[".pdf", ".docx", ".pptx", ".txt"],
                 file_count="multiple",
+                height=220,
             )
+    # ── Results ────────────────────────────────────────────────────────────────
     with gr.Row():
+        with gr.Column(scale=3):
             result_table = gr.Dataframe(
+                label="📊 Scoreboard",
                 interactive=False,
                 wrap=True,
             )
+        with gr.Column(scale=2):
+            summary_md = gr.Markdown("*Results will appear here after screening.*")
+    # ── Download ───────────────────────────────────────────────────────────────
+    download_file = gr.File(
+        label="⬇️ Download Screened Documents (ZIP)",
+        interactive=False,
     )
+    gr.Markdown(
+        "---\n"
+        "**Scoring:** `Final Score = Semantic Score × Keyword Coverage`"
+        " — docs with < 5% keyword overlap are capped at 20.  \n"
+        "*Author: Kajal Dadas · kajaldadas149@gmail.com*"
     )
+    screen_btn.click(
+        fn=run_screening,
+        inputs=[prompt_box, file_upload, top_n_slider],
+        outputs=[result_table, summary_md, download_file],
     )
 if __name__ == "__main__":