csAhmad committed on
Commit
7fdd999
·
verified ·
1 Parent(s): a4bb110

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -30
app.py CHANGED
@@ -3,9 +3,13 @@ import zipfile
3
  import os
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
 
6
  from pypdf import PdfReader
7
  import docx
8
 
 
 
 
9
  model = SentenceTransformer("csAhmad/zoraiz-model")
10
 
11
  EXTRACT_PATH = "temp/extracted"
@@ -17,30 +21,55 @@ EXTRACT_PATH = "temp/extracted"
17
  def extract_text(file_path):
18
  path = file_path.lower()
19
 
20
- # PDF
21
- if path.endswith(".pdf"):
22
- reader = PdfReader(file_path)
23
- return " ".join([p.extract_text() or "" for p in reader.pages])
 
 
 
 
 
 
 
 
 
24
 
25
- # DOCX
26
- elif path.endswith(".docx"):
27
- doc = docx.Document(file_path)
28
- return "\n".join([para.text for para in doc.paragraphs])
29
 
30
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
  # -------------------------
34
  # MAIN FUNCTION
35
  # -------------------------
36
- def process_zip(zip_file):
37
 
38
- if zip_file is None:
39
- raise gr.Error("Please upload a ZIP file.")
40
 
41
- # reset folder
42
  if os.path.exists(EXTRACT_PATH):
43
- for root, dirs, files in os.walk(EXTRACT_PATH):
44
  for f in files:
45
  try:
46
  os.remove(os.path.join(root, f))
@@ -58,34 +87,55 @@ def process_zip(zip_file):
58
  except zipfile.BadZipFile:
59
  raise gr.Error("Invalid ZIP file.")
60
 
 
 
 
61
  results = []
62
 
63
- # 🔥 recursive scan (root + folders)
64
- for root, dirs, files in os.walk(EXTRACT_PATH):
65
  for file in files:
66
  file_path = os.path.join(root, file)
67
 
68
  text = extract_text(file_path)
69
 
70
- if not text or not text.strip():
71
  continue
72
 
73
  try:
74
- emb = model.encode(text)
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  results.append({
77
- "file": os.path.relpath(file_path, EXTRACT_PATH),
78
- "type": file.split(".")[-1],
79
- "text_length": len(text),
80
- "embedding_dim": len(emb),
81
- "preview_embedding": str(emb[:10])
 
 
 
 
 
 
 
82
  })
83
 
84
  except Exception as e:
85
- print(f"Embedding failed for {file_path}: {e}")
86
 
87
- if len(results) == 0:
88
- raise gr.Error("No readable PDF or DOCX files found in ZIP.")
89
 
90
  df = pd.DataFrame(results)
91
 
@@ -100,10 +150,13 @@ def process_zip(zip_file):
100
  # -------------------------
101
  demo = gr.Interface(
102
  fn=process_zip,
103
- inputs=gr.File(file_types=[".zip"]),
104
- outputs=gr.File(label="Download Excel"),
105
- title="ZIP PDF/DOCX Embeddings",
106
- description="Upload ZIP containing PDFs and DOCX files (even inside folders). Outputs embeddings in Excel."
 
 
 
107
  )
108
 
109
  demo.launch()
 
3
  import os
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
  from pypdf import PdfReader
8
  import docx
9
 
10
+ # -------------------------
11
+ # MODEL
12
+ # -------------------------
13
  model = SentenceTransformer("csAhmad/zoraiz-model")
14
 
15
  EXTRACT_PATH = "temp/extracted"
 
21
  def extract_text(file_path):
22
  path = file_path.lower()
23
 
24
+ try:
25
+ if path.endswith(".pdf"):
26
+ reader = PdfReader(file_path)
27
+ return " ".join([p.extract_text() or "" for p in reader.pages])
28
+
29
+ elif path.endswith(".docx"):
30
+ doc = docx.Document(file_path)
31
+ return "\n".join([para.text for para in doc.paragraphs])
32
+
33
+ except:
34
+ return ""
35
+
36
+ return ""
37
 
 
 
 
 
38
 
39
+ # -------------------------
40
+ # SIMPLE CV FIELD EXTRACTOR (replace with LLM later)
41
+ # -------------------------
42
def extract_cv_fields(text):
    """Build the row of CV fields for one document (placeholder logic).

    Only "Name (Age)" is populated: it is taken from the first non-blank
    line of ``text`` with surrounding whitespace removed. Every other field
    is returned as an empty string until a real extractor replaces this.

    Args:
        text: Plain text extracted from a CV. ``None`` is treated as "".

    Returns:
        A dict with the twelve report columns, in report order.
    """
    # splitlines() also handles "\r\n" endings; skipping blank lines avoids
    # an empty name when the extracted text starts with whitespace.
    stripped_lines = (line.strip() for line in (text or "").splitlines())
    first_line = next((line for line in stripped_lines if line), "")

    return {
        "Name (Age)": first_line,
        "Contact": "",
        "Current Job": "",
        "Qualification": "",
        "Experience": "",
        "Publications": "",
        "Citation": "",
        "H-index": "",
        "Nationality": "",
        "Other Achievements": "",
        "Area": "",
        "Comments": "",
    }
60
 
61
 
62
  # -------------------------
63
  # MAIN FUNCTION
64
  # -------------------------
65
+ def process_zip(zip_file, jd_text):
66
 
67
+ if zip_file is None or jd_text.strip() == "":
68
+ raise gr.Error("Please upload ZIP and enter Job Description.")
69
 
70
+ # clean folder
71
  if os.path.exists(EXTRACT_PATH):
72
+ for root, _, files in os.walk(EXTRACT_PATH):
73
  for f in files:
74
  try:
75
  os.remove(os.path.join(root, f))
 
87
  except zipfile.BadZipFile:
88
  raise gr.Error("Invalid ZIP file.")
89
 
90
+ # JD embedding
91
+ jd_embedding = model.encode(jd_text)
92
+
93
  results = []
94
 
95
+ # scan CVs
96
+ for root, _, files in os.walk(EXTRACT_PATH):
97
  for file in files:
98
  file_path = os.path.join(root, file)
99
 
100
  text = extract_text(file_path)
101
 
102
+ if not text.strip():
103
  continue
104
 
105
  try:
106
+ cv_embedding = model.encode(text)
107
+
108
+ score = cosine_similarity(
109
+ [cv_embedding],
110
+ [jd_embedding]
111
+ )[0][0]
112
+
113
+ # filter threshold (adjust if needed)
114
+ if score < 0.60:
115
+ continue
116
+
117
+ fields = extract_cv_fields(text)
118
 
119
  results.append({
120
+ "Name (Age)": fields["Name (Age)"],
121
+ "Contact": fields["Contact"],
122
+ "Current Job": fields["Current Job"],
123
+ "Qualification": fields["Qualification"],
124
+ "Experience": fields["Experience"],
125
+ "Publications": fields["Publications"],
126
+ "Citation": fields["Citation"],
127
+ "H-index": fields["H-index"],
128
+ "Nationality": fields["Nationality"],
129
+ "Other Achievements": fields["Other Achievements"],
130
+ "Area": fields["Area"],
131
+ "Comments": fields["Comments"]
132
  })
133
 
134
  except Exception as e:
135
+ print(f"Error processing {file}: {e}")
136
 
137
+ if not results:
138
+ raise gr.Error("No matching CVs found for this JD.")
139
 
140
  df = pd.DataFrame(results)
141
 
 
150
  # -------------------------
151
  demo = gr.Interface(
152
  fn=process_zip,
153
+ inputs=[
154
+ gr.File(file_types=[".zip"]),
155
+ gr.Textbox(lines=10, label="Job Description (JD)")
156
+ ],
157
+ outputs=gr.File(label="Download Filtered CV Excel"),
158
+ title="AI CV Screening System",
159
+ description="Upload ZIP of CVs + Job Description → Get ranked candidates in Excel"
160
  )
161
 
162
  demo.launch()