Michtiii commited on
Commit
11339a8
·
verified ·
1 Parent(s): 3f3bb41

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -128
app.py CHANGED
@@ -1,12 +1,12 @@
1
  """
2
  AI Document Screening Agent — Gradio App for Hugging Face Spaces
3
  Author: Kajal Dadas | kajaldadas149@gmail.com
4
- Enhanced for HF Spaces deployment with Gradio UI
5
  """
6
 
7
  import os
8
  import re
9
  import shutil
 
10
  import tempfile
11
 
12
  import faiss
@@ -15,7 +15,7 @@ import pandas as pd
15
  import gradio as gr
16
  from sentence_transformers import SentenceTransformer
17
 
18
- # ── Optional parsers (graceful fallback if not installed) ──────────────────────
19
  try:
20
  from PyPDF2 import PdfReader
21
  HAS_PDF = True
@@ -34,14 +34,17 @@ try:
34
  except ImportError:
35
  HAS_PPTX = False
36
 
37
- # ── Model (cached globally for speed) ─────────────────────────────────────────
38
- MODEL_NAME = "all-MiniLM-L6-v2"
 
 
 
39
  _model = None
40
 
41
  def get_model():
42
  global _model
43
  if _model is None:
44
- _model = SentenceTransformer(MODEL_NAME)
45
  return _model
46
 
47
  # ── Text extraction ────────────────────────────────────────────────────────────
@@ -50,19 +53,19 @@ def extract_text(file_path: str) -> str:
50
 
51
  if ext == ".pdf":
52
  if not HAS_PDF:
53
- return "[PDF support unavailable — install PyPDF2]"
54
  reader = PdfReader(file_path)
55
  return " ".join(page.extract_text() or "" for page in reader.pages)
56
 
57
  if ext == ".docx":
58
  if not HAS_DOCX:
59
- return "[DOCX support unavailable — install python-docx]"
60
  doc = DocxDocument(file_path)
61
  return " ".join(p.text for p in doc.paragraphs)
62
 
63
  if ext == ".pptx":
64
  if not HAS_PPTX:
65
- return "[PPTX support unavailable — install python-pptx]"
66
  prs = pptx.Presentation(file_path)
67
  texts = []
68
  for slide in prs.slides:
@@ -75,206 +78,189 @@ def extract_text(file_path: str) -> str:
75
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
76
  return f.read()
77
 
78
- return f"[Unsupported file type: {ext}]"
79
 
80
  # ── Keyword helpers ────────────────────────────────────────────────────────────
81
  STOPWORDS = {
82
  "with","and","the","for","are","you","will","have","this","that","from",
83
  "our","your","about","who","their","them","into","such","also","not",
84
  "but","can","all","has","its","was","were","been","more","than","when",
85
- "which","these","those","some","what","very","just","over","then","than",
86
- "each","much","well","also","need","must","use","may","any","new","per",
87
  }
88
 
89
- def extract_keywords(text: str) -> list[str]:
90
  words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
91
  return list({w for w in words if w not in STOPWORDS})
92
 
93
  # ── Scoring engine ─────────────────────────────────────────────────────────────
94
- def score_documents(prompt: str, file_paths: list[str]) -> pd.DataFrame:
95
- if not prompt.strip():
96
- raise gr.Error("Please enter a job description / screening prompt.")
97
- if not file_paths:
98
- raise gr.Error("Please upload at least one document.")
99
-
100
- model = get_model()
101
- jd_lower = prompt.lower()
102
- jd_keywords = extract_keywords(jd_lower)
103
 
104
- doc_texts, doc_names = [], []
105
  for fp in file_paths:
106
- name = os.path.basename(fp)
107
- text = extract_text(fp).lower()
108
- doc_texts.append(text)
109
- doc_names.append(name)
110
 
111
- # Semantic embeddings
112
- jd_emb = model.encode([jd_lower])
113
- doc_embs = model.encode(doc_texts)
114
 
115
- dim = doc_embs.shape[1]
116
- index = faiss.IndexFlatL2(dim)
117
  index.add(np.array(doc_embs, dtype=np.float32))
118
-
119
- distances, indices = index.search(np.array(jd_emb, dtype=np.float32), len(doc_names))
120
 
121
  rows = []
122
  for rank, idx in enumerate(indices[0]):
123
- text = doc_texts[idx]
124
- matches = sum(1 for k in jd_keywords if k in text)
125
- keyword_ratio = matches / max(len(jd_keywords), 1)
126
- sem_score = max(0.0, 100.0 - distances[0][rank] * 10)
127
-
128
- # Strict scoring: penalise near-zero keyword overlap
129
- if keyword_ratio < 0.05:
130
- final_score = min(sem_score, 20.0)
131
- else:
132
- final_score = sem_score * keyword_ratio
133
 
134
  rows.append({
135
- "File Name": doc_names[idx],
136
- "Keyword Matches": matches,
 
137
  "Keyword Coverage %": round(keyword_ratio * 100, 1),
138
- "Semantic Score": round(sem_score, 2),
139
- "Final Score": round(final_score, 2),
140
  })
141
 
142
- df = pd.DataFrame(rows).sort_values("Final Score", ascending=False).reset_index(drop=True)
143
- df.index += 1
144
  df.index.name = "Rank"
145
  return df
146
 
147
- # ── Gradio interface ───────────────────────────────────────────────────────────
148
- DESCRIPTION = """
149
- ## 🤖 AI Document Screening Agent
150
- Upload **any documents** (PDF, DOCX, PPTX, TXT) and describe what you're looking for.
151
- The agent combines **semantic AI matching** with **strict keyword coverage** to rank candidates.
152
-
153
- > *Built with Sentence-Transformers + FAISS · Supports PDF, DOCX, PPTX, TXT*
154
- """
155
 
 
156
  def run_screening(prompt, files, top_n):
157
- if files is None or len(files) == 0:
158
- return None, "⚠️ No files uploaded."
159
-
 
 
160
  try:
161
  df = score_documents(prompt, [f.name for f in files])
162
- except gr.Error as e:
163
- return None, str(e)
164
  except Exception as e:
165
- return None, f"❌ Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- top_df = df.head(int(top_n))
168
- summary_lines = [f"✅ Screened **{len(files)} document(s)** · Showing top **{int(top_n)}** results\n"]
 
 
 
169
  for _, row in top_df.iterrows():
170
- bar_filled = int(row["Final Score"] / 100 * 20)
171
- bar = "█" * bar_filled + "░" * (20 - bar_filled)
172
- summary_lines.append(
173
  f"**{row['File Name']}**\n"
174
- f"`{bar}` {row['Final Score']}% "
175
- f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
176
  )
177
- return top_df.reset_index(), "\n\n".join(summary_lines)
178
 
 
179
 
 
180
  with gr.Blocks(
181
  title="AI Document Screening Agent",
182
  theme=gr.themes.Soft(
183
- primary_hue="violet",
184
- secondary_hue="purple",
185
  neutral_hue="slate",
186
  font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
187
  ),
188
  css="""
189
- #title-banner {
190
- background: linear-gradient(135deg, #6d28d9 0%, #7c3aed 50%, #4f46e5 100%);
191
- border-radius: 14px;
192
- padding: 24px 32px;
193
- margin-bottom: 8px;
194
  color: white;
 
195
  }
196
- #title-banner h1 { margin: 0; font-size: 2rem; font-weight: 800; }
197
- #title-banner p { margin: 6px 0 0; opacity: 0.85; font-size: 0.95rem; }
198
- .gr-button-primary { background: #7c3aed !important; }
199
  footer { display: none !important; }
200
  """,
201
  ) as demo:
202
 
203
  gr.HTML("""
204
- <div id="title-banner">
205
  <h1>🤖 AI Document Screening Agent</h1>
206
  <p>Semantic AI + Keyword matching · PDF · DOCX · PPTX · TXT</p>
207
  </div>
208
  """)
209
 
 
210
  with gr.Row():
211
  with gr.Column(scale=2):
212
  prompt_box = gr.Textbox(
213
- label="📋 Job Description / Screening Prompt",
214
- placeholder=(
215
- "e.g. Looking for a senior Python developer with experience in "
216
- "machine learning, FastAPI, Docker, and AWS. Strong communication skills required."
217
- ),
218
- lines=6,
219
- show_copy_button=True,
220
  )
221
  with gr.Row():
222
- top_n_slider = gr.Slider(
223
- minimum=1, maximum=20, value=5, step=1,
224
- label="Top N results to highlight",
225
- )
226
- screen_btn = gr.Button("🔍 Screen Documents", variant="primary", scale=1)
227
 
228
  with gr.Column(scale=1):
229
  file_upload = gr.File(
230
- label="📁 Upload Documents",
231
  file_types=[".pdf", ".docx", ".pptx", ".txt"],
232
  file_count="multiple",
233
- height=200,
234
  )
235
 
 
236
  with gr.Row():
237
- with gr.Column():
238
  result_table = gr.Dataframe(
239
- label="📊 Screening Scoreboard",
240
- headers=["Rank", "File Name", "Keyword Matches", "Keyword Coverage %", "Semantic Score", "Final Score"],
241
  interactive=False,
242
  wrap=True,
243
  )
244
- with gr.Column():
245
- summary_box = gr.Markdown(label="📝 Summary", value="*Results will appear here after screening.*")
246
 
247
- screen_btn.click(
248
- fn=run_screening,
249
- inputs=[prompt_box, file_upload, top_n_slider],
250
- outputs=[result_table, summary_box],
251
- api_name="screen",
252
  )
253
 
254
- gr.Examples(
255
- examples=[
256
- [
257
- "Looking for a data scientist with Python, machine learning, TensorFlow, SQL, and data visualisation skills. PhD preferred.",
258
- None, 5
259
- ],
260
- [
261
- "Hiring a frontend engineer with React, TypeScript, CSS, and experience in responsive design and accessibility.",
262
- None, 3
263
- ],
264
- ],
265
- inputs=[prompt_box, file_upload, top_n_slider],
266
- label="💡 Example Prompts",
267
  )
268
 
269
- gr.Markdown(
270
- """
271
- ---
272
- **How scoring works:**
273
- `Final Score = Semantic Score × Keyword Coverage` — documents with < 5 % keyword overlap are capped at 20.
274
- Built with 🤗 `sentence-transformers/all-MiniLM-L6-v2` + FAISS.
275
- *Author: Kajal Dadas · kajaldadas149@gmail.com*
276
- """,
277
- elem_id="footer-note",
278
  )
279
 
280
  if __name__ == "__main__":
 
1
  """
2
  AI Document Screening Agent — Gradio App for Hugging Face Spaces
3
  Author: Kajal Dadas | kajaldadas149@gmail.com
 
4
  """
5
 
6
  import os
7
  import re
8
  import shutil
9
+ import zipfile
10
  import tempfile
11
 
12
  import faiss
 
15
  import gradio as gr
16
  from sentence_transformers import SentenceTransformer
17
 
18
+ # ── Optional parsers ───────────────────────────────────────────────────────────
19
  try:
20
  from PyPDF2 import PdfReader
21
  HAS_PDF = True
 
34
  except ImportError:
35
  HAS_PPTX = False
36
 
37
+ # ── Screened output folder ─────────────────────────────────────────────────────
38
+ SCREENED_FOLDER = "screened_documents"
39
+ os.makedirs(SCREENED_FOLDER, exist_ok=True)
40
+
41
+ # ── Model (cached) ─────────────────────────────────────────────────────────────
42
# Process-wide cache for the embedding model; populated on first use.
_model = None


def get_model():
    """Return the shared sentence-embedding model, loading it on first call.

    The instance is stored in the module-level ``_model`` so later calls
    reuse it instead of constructing a new ``SentenceTransformer``.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    return _model
49
 
50
  # ── Text extraction ────────────────────────────────────────────────────────────
 
53
 
54
  if ext == ".pdf":
55
  if not HAS_PDF:
56
+ return ""
57
  reader = PdfReader(file_path)
58
  return " ".join(page.extract_text() or "" for page in reader.pages)
59
 
60
  if ext == ".docx":
61
  if not HAS_DOCX:
62
+ return ""
63
  doc = DocxDocument(file_path)
64
  return " ".join(p.text for p in doc.paragraphs)
65
 
66
  if ext == ".pptx":
67
  if not HAS_PPTX:
68
+ return ""
69
  prs = pptx.Presentation(file_path)
70
  texts = []
71
  for slide in prs.slides:
 
78
  with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
79
  return f.read()
80
 
81
+ return ""
82
 
83
  # ── Keyword helpers ────────────────────────────────────────────────────────────
84
  STOPWORDS = {
85
  "with","and","the","for","are","you","will","have","this","that","from",
86
  "our","your","about","who","their","them","into","such","also","not",
87
  "but","can","all","has","its","was","were","been","more","than","when",
88
+ "which","these","those","some","what","very","just","over","then","each",
89
+ "much","well","need","must","use","may","any","new","per",
90
  }
91
 
92
+ def extract_keywords(text: str) -> list:
93
  words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())
94
  return list({w for w in words if w not in STOPWORDS})
95
 
96
  # ── Scoring engine ─────────────────────────────────────────────────────────────
97
def score_documents(prompt: str, file_paths: list) -> pd.DataFrame:
    """Rank documents against a screening prompt.

    Every document is scored two ways: a semantic score derived from the
    FAISS L2 distance between the prompt embedding and the document
    embedding, and a keyword coverage ratio (share of prompt keywords that
    occur as substrings of the document text). The final score is the
    semantic score scaled by the coverage ratio; documents covering fewer
    than 5% of the keywords are capped at 20.

    Args:
        prompt: Screening prompt; it is lower-cased before use.
        file_paths: Paths of the documents to score.

    Returns:
        A DataFrame sorted by "Final Score" (descending) whose index is the
        1-based "Rank". Columns: "File Name", "_path", "Keyword Matches",
        "Keyword Coverage %", "Semantic Score", "Final Score". When
        ``file_paths`` is empty an empty frame with the same columns is
        returned.
    """
    columns = [
        "File Name",
        "_path",
        "Keyword Matches",
        "Keyword Coverage %",
        "Semantic Score",
        "Final Score",
    ]

    # Nothing to score: encoding an empty batch leaves no embedding dimension
    # to build the index from, so return an empty, correctly shaped result.
    if not file_paths:
        empty = pd.DataFrame(columns=columns)
        empty.index.name = "Rank"
        return empty

    model = get_model()
    prompt_lower = prompt.lower()
    keywords = extract_keywords(prompt_lower)

    doc_paths = list(file_paths)
    doc_names = [os.path.basename(fp) for fp in doc_paths]
    doc_texts = [extract_text(fp).lower() for fp in doc_paths]

    prompt_emb = model.encode([prompt_lower])
    doc_embs = model.encode(doc_texts)

    index = faiss.IndexFlatL2(doc_embs.shape[1])
    index.add(np.array(doc_embs, dtype=np.float32))
    # Ask for as many neighbours as there are documents so each one is scored.
    distances, indices = index.search(
        np.array(prompt_emb, dtype=np.float32), len(doc_names)
    )

    rows = []
    for distance, idx in zip(distances[0], indices[0]):
        text = doc_texts[idx]
        matches = sum(1 for k in keywords if k in text)
        # max(..., 1) avoids dividing by zero when the prompt has no keywords.
        keyword_ratio = matches / max(len(keywords), 1)
        sem_score = max(0.0, 100.0 - distance * 10)
        if keyword_ratio < 0.05:
            final_score = min(sem_score, 20.0)
        else:
            final_score = sem_score * keyword_ratio

        rows.append({
            "File Name": doc_names[idx],
            "_path": doc_paths[idx],
            "Keyword Matches": matches,
            "Keyword Coverage %": round(keyword_ratio * 100, 1),
            "Semantic Score": round(sem_score, 2),
            "Final Score": round(final_score, 2),
        })

    df = (
        pd.DataFrame(rows, columns=columns)
        .sort_values("Final Score", ascending=False)
        .reset_index(drop=True)
    )
    df.index += 1
    df.index.name = "Rank"
    return df
136
 
137
+ # ── ZIP builder ────────────────────────────────────────────────────────────────
138
def build_zip(paths: list) -> str:
    """Bundle the given files into a new ZIP archive and return its path.

    Args:
        paths: Paths of the files to include. A path listed more than once
            is written only once.

    Returns:
        Path of an archive named ``screened_documents.zip`` inside a freshly
        created private temporary directory. Members are stored flat under
        their base names; when two different files share a base name the
        later one is stored as ``"name (2).ext"``, ``"name (3).ext"``, ...
    """
    # A fresh private directory keeps the download name stable while avoiding
    # a predictable file name in the shared temp directory, which every call
    # would otherwise overwrite.
    out_dir = tempfile.mkdtemp(prefix="screened_")
    zip_path = os.path.join(out_dir, "screened_documents.zip")

    seen_paths = set()
    used_names = set()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for fp in paths:
            resolved = os.path.abspath(fp)
            if resolved in seen_paths:
                continue
            seen_paths.add(resolved)

            base = os.path.basename(fp)
            stem, ext = os.path.splitext(base)
            arcname, counter = base, 1
            # Duplicate member names would shadow each other on extraction.
            while arcname in used_names:
                counter += 1
                arcname = f"{stem} ({counter}){ext}"
            used_names.add(arcname)

            zf.write(fp, arcname=arcname)
    return zip_path
 
144
 
145
+ # ── Main handler ───────────────────────────────────────────────────────────────
146
def run_screening(prompt, files, top_n):
    """Score the uploaded documents and package the top matches.

    Args:
        prompt: Free-text screening prompt.
        files: Uploaded files; each item is either a path string or an
            object exposing its path as ``.name``.
        top_n: Maximum number of top-ranked documents to keep (passed
            through ``int``).

    Returns:
        A ``(table, summary, zip_path)`` tuple: the scoreboard DataFrame for
        the kept documents, a Markdown summary, and the path of a ZIP
        archive holding copies of those documents. On missing input or any
        processing failure the tuple is ``(None, message, None)``.
    """
    if not prompt or not prompt.strip():
        return None, "⚠️ Enter a screening prompt first.", None
    if not files:
        return None, "⚠️ Upload at least one document.", None

    # Broad catch is deliberate: this is the UI boundary, and scoring, copy
    # and zip failures should all surface as a message, not a stack trace.
    try:
        paths = [getattr(f, "name", f) for f in files]
        df = score_documents(prompt, paths)
        top_df = df.head(int(top_n))

        # Start from an empty output folder so it only holds this run's picks.
        shutil.rmtree(SCREENED_FOLDER, ignore_errors=True)
        os.makedirs(SCREENED_FOLDER, exist_ok=True)

        saved = []
        used_names = set()
        for _, row in top_df.iterrows():
            # Uploads from different folders may share a base name; give
            # later ones a numeric suffix instead of overwriting the first.
            stem, ext = os.path.splitext(row["File Name"])
            name, counter = row["File Name"], 1
            while name in used_names:
                counter += 1
                name = f"{stem} ({counter}){ext}"
            used_names.add(name)

            dest = os.path.join(SCREENED_FOLDER, name)
            shutil.copy2(row["_path"], dest)
            saved.append(dest)

        zip_path = build_zip(saved)
    except Exception as e:
        return None, f"❌ Error: {e}", None

    display_df = top_df.drop(columns=["_path"]).reset_index()

    # Report how many documents were actually kept, which can be fewer than
    # the requested top_n when fewer files were uploaded.
    lines = [
        f"✅ **{len(files)} document(s) screened** · "
        f"Top **{len(top_df)}** saved to `screened_documents/`\n"
    ]
    for _, row in top_df.iterrows():
        # 20-character bar; clamp so an out-of-range score cannot break it.
        filled = min(20, max(0, int(row["Final Score"] / 100 * 20)))
        bar = "█" * filled + "░" * (20 - filled)
        lines.append(
            f"**{row['File Name']}**\n"
            f"`{bar}` {row['Final Score']}  "
            f"| Keywords: {row['Keyword Matches']} | Semantic: {row['Semantic Score']}"
        )

    return display_df, "\n\n".join(lines), zip_path
185
 
186
+ # ── Gradio UI ──────────────────────────────────────────────────────────────────
187
  with gr.Blocks(
188
  title="AI Document Screening Agent",
189
  theme=gr.themes.Soft(
190
+ primary_hue="purple",
191
+ secondary_hue="indigo",
192
  neutral_hue="slate",
193
  font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
194
  ),
195
  css="""
196
+ #banner {
197
+ background: linear-gradient(135deg, #6d28d9, #4f46e5);
198
+ border-radius: 12px;
199
+ padding: 20px 28px;
 
200
  color: white;
201
+ margin-bottom: 4px;
202
  }
203
+ #banner h1 { margin: 0; font-size: 1.8rem; font-weight: 800; }
204
+ #banner p { margin: 4px 0 0; opacity: 0.8; font-size: 0.9rem; }
 
205
  footer { display: none !important; }
206
  """,
207
  ) as demo:
208
 
209
  gr.HTML("""
210
+ <div id="banner">
211
  <h1>🤖 AI Document Screening Agent</h1>
211
  <p>Semantic AI + Keyword matching · PDF · DOCX · PPTX · TXT</p>
213
  </div>
214
  """)
215
 
216
+ # ── Inputs ─────────────────────────────────────────────────────────────────
217
  with gr.Row():
218
  with gr.Column(scale=2):
219
  prompt_box = gr.Textbox(
220
+ label="Screening Prompt",
221
+ placeholder="Describe what you are looking for in these documents...",
222
+ lines=5,
 
 
 
 
223
  )
224
  with gr.Row():
225
+ top_n_slider = gr.Slider(1, 20, value=5, step=1, label="Top N to screen")
226
+ screen_btn = gr.Button("🔍 Run Screening", variant="primary")
 
 
 
227
 
228
  with gr.Column(scale=1):
229
  file_upload = gr.File(
230
+ label="Upload Documents",
231
  file_types=[".pdf", ".docx", ".pptx", ".txt"],
232
  file_count="multiple",
233
+ height=220,
234
  )
235
 
236
+ # ── Results ────────────────────────────────────────────────────────────────
237
  with gr.Row():
238
+ with gr.Column(scale=3):
239
  result_table = gr.Dataframe(
240
+ label="📊 Scoreboard",
 
241
  interactive=False,
242
  wrap=True,
243
  )
244
+ with gr.Column(scale=2):
245
+ summary_md = gr.Markdown("*Results will appear here after screening.*")
246
 
247
+ # ── Download ───────────────────────────────────────────────────────────────
248
+ download_file = gr.File(
249
+ label="⬇️ Download Screened Documents (ZIP)",
250
+ interactive=False,
 
251
  )
252
 
253
+ gr.Markdown(
254
+ "---\n"
255
+ "**Scoring:** `Final Score = Semantic Score × Keyword Coverage`"
256
+ " — docs with < 5% keyword overlap are capped at 20. \n"
257
+ "*Author: Kajal Dadas · kajaldadas149@gmail.com*"
 
 
 
 
 
 
 
 
258
  )
259
 
260
+ screen_btn.click(
261
+ fn=run_screening,
262
+ inputs=[prompt_box, file_upload, top_n_slider],
263
+ outputs=[result_table, summary_md, download_file],
 
 
 
 
 
264
  )
265
 
266
  if __name__ == "__main__":