Spaces:

csAhmad
/

CV_Job_Matching_AI_Model

Sleeping

App Files Community

csAhmad committed on Apr 27

Commit

7fdd999

verified ·

1 Parent(s): a4bb110

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -30

app.py CHANGED Viewed

@@ -3,9 +3,13 @@ import zipfile
 import os
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from pypdf import PdfReader
 import docx
 model = SentenceTransformer("csAhmad/zoraiz-model")
 EXTRACT_PATH = "temp/extracted"
@@ -17,30 +21,55 @@ EXTRACT_PATH = "temp/extracted"
 def extract_text(file_path):
     path = file_path.lower()
-    # PDF
-    if path.endswith(".pdf"):
-        reader = PdfReader(file_path)
-        return " ".join([p.extract_text() or "" for p in reader.pages])
-    # DOCX
-    elif path.endswith(".docx"):
-        doc = docx.Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
-    return None
 # -------------------------
 # MAIN FUNCTION
 # -------------------------
-def process_zip(zip_file):
-    if zip_file is None:
-        raise gr.Error("Please upload a ZIP file.")
-    # reset folder
     if os.path.exists(EXTRACT_PATH):
-        for root, dirs, files in os.walk(EXTRACT_PATH):
             for f in files:
                 try:
                     os.remove(os.path.join(root, f))
@@ -58,34 +87,55 @@ def process_zip(zip_file):
     except zipfile.BadZipFile:
         raise gr.Error("Invalid ZIP file.")
     results = []
-    # 🔥 recursive scan (root + folders)
-    for root, dirs, files in os.walk(EXTRACT_PATH):
         for file in files:
             file_path = os.path.join(root, file)
             text = extract_text(file_path)
-            if not text or not text.strip():
                 continue
             try:
-                emb = model.encode(text)
                 results.append({
-                    "file": os.path.relpath(file_path, EXTRACT_PATH),
-                    "type": file.split(".")[-1],
-                    "text_length": len(text),
-                    "embedding_dim": len(emb),
-                    "preview_embedding": str(emb[:10])
                 })
             except Exception as e:
-                print(f"Embedding failed for {file_path}: {e}")
-    if len(results) == 0:
-        raise gr.Error("No readable PDF or DOCX files found in ZIP.")
     df = pd.DataFrame(results)
@@ -100,10 +150,13 @@ def process_zip(zip_file):
 # -------------------------
 demo = gr.Interface(
     fn=process_zip,
-    inputs=gr.File(file_types=[".zip"]),
-    outputs=gr.File(label="Download Excel"),
-    title="ZIP → PDF/DOCX Embeddings",
-    description="Upload ZIP containing PDFs and DOCX files (even inside folders). Outputs embeddings in Excel."
 )
 demo.launch()

 import os
 import pandas as pd
 from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
 from pypdf import PdfReader
 import docx
+# -------------------------
+# MODEL
+# -------------------------
 # Sentence-transformer model used to embed both the job description and each CV.
 # Built once at module level so process_zip reuses the same instance.
 model = SentenceTransformer("csAhmad/zoraiz-model")
 # Working directory that process_zip clears of old files and then walks for CVs.
 EXTRACT_PATH = "temp/extracted"
 def extract_text(file_path):
     """Return the plain text of a PDF or DOCX file.

     The extension check is case-insensitive. PDF pages are joined with a
     single space and DOCX paragraphs with a newline.

     Args:
         file_path: Path of the document to read.

     Returns:
         The extracted text, or "" when the extension is neither .pdf nor
         .docx or when the file cannot be parsed.
     """
     path = file_path.lower()
     try:
         if path.endswith(".pdf"):
             reader = PdfReader(file_path)
             # A page may yield no text; fall back to "" so join() never sees None.
             return " ".join(page.extract_text() or "" for page in reader.pages)
         if path.endswith(".docx"):
             document = docx.Document(file_path)
             return "\n".join(para.text for para in document.paragraphs)
     except Exception as exc:
         # Best effort: one unreadable file must not abort the whole batch, but
         # report it instead of hiding it. Catching Exception (not a bare
         # except) lets KeyboardInterrupt and SystemExit propagate.
         print(f"Could not read {file_path}: {exc}")
         return ""
     return ""
+# -------------------------
+# SIMPLE CV FIELD EXTRACTOR (replace with LLM later)
+# -------------------------
def extract_cv_fields(text):
    """Build the spreadsheet row for one CV from its extracted text.

    Placeholder logic (safe for the HF Spaces demo): only "Name (Age)" is
    filled, using the first non-blank line of the CV with surrounding
    whitespace removed. Every other column is left as an empty string.

    Args:
        text: Plain text extracted from a CV.

    Returns:
        A dict with the twelve report columns, in output order.
    """
    # Extracted text often starts with blank lines or uses "\r\n" endings, so
    # skip empty lines and strip the result instead of taking line 0 verbatim.
    first_line = next(
        (line.strip() for line in text.splitlines() if line.strip()),
        "",
    )
    return {
        "Name (Age)": first_line,
        "Contact": "",
        "Current Job": "",
        "Qualification": "",
        "Experience": "",
        "Publications": "",
        "Citation": "",
        "H-index": "",
        "Nationality": "",
        "Other Achievements": "",
        "Area": "",
        "Comments": "",
    }
 # -------------------------
 # MAIN FUNCTION
 # -------------------------
+def process_zip(zip_file, jd_text):
+    if zip_file is None or jd_text.strip() == "":
+        raise gr.Error("Please upload ZIP and enter Job Description.")
+    # clean folder
     if os.path.exists(EXTRACT_PATH):
+        for root, _, files in os.walk(EXTRACT_PATH):
             for f in files:
                 try:
                     os.remove(os.path.join(root, f))
     except zipfile.BadZipFile:
         raise gr.Error("Invalid ZIP file.")
+    # JD embedding
+    jd_embedding = model.encode(jd_text)
     results = []
+    # scan CVs
+    for root, _, files in os.walk(EXTRACT_PATH):
         for file in files:
             file_path = os.path.join(root, file)
             text = extract_text(file_path)
+            if not text.strip():
                 continue
             try:
+                cv_embedding = model.encode(text)
+                score = cosine_similarity(
+                    [cv_embedding],
+                    [jd_embedding]
+                )[0][0]
+                # filter threshold (adjust if needed)
+                if score < 0.60:
+                    continue
+                fields = extract_cv_fields(text)
                 results.append({
+                    "Name (Age)": fields["Name (Age)"],
+                    "Contact": fields["Contact"],
+                    "Current Job": fields["Current Job"],
+                    "Qualification": fields["Qualification"],
+                    "Experience": fields["Experience"],
+                    "Publications": fields["Publications"],
+                    "Citation": fields["Citation"],
+                    "H-index": fields["H-index"],
+                    "Nationality": fields["Nationality"],
+                    "Other Achievements": fields["Other Achievements"],
+                    "Area": fields["Area"],
+                    "Comments": fields["Comments"]
                 })
             except Exception as e:
+                print(f"Error processing {file}: {e}")
+    if not results:
+        raise gr.Error("No matching CVs found for this JD.")
     df = pd.DataFrame(results)
 # -------------------------
 # Gradio front end: a ZIP upload plus a job-description box go in, and a
 # downloadable file comes out.
 demo = gr.Interface(
     title="AI CV Screening System",
     description="Upload ZIP of CVs + Job Description → Get ranked candidates in Excel",
     fn=process_zip,
     # Input order matches process_zip(zip_file, jd_text).
     inputs=[
         gr.File(file_types=[".zip"]),
         gr.Textbox(label="Job Description (JD)", lines=10),
     ],
     outputs=gr.File(label="Download Filtered CV Excel"),
 )

 # Start the web app.
 demo.launch()