Spaces:

omgy
/

resume

Sleeping

App Files Community

omgy committed on Dec 14, 2025

Commit

1aa239f

verified ·

1 Parent(s): 626293c

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -17

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from pypdf import PdfReader
 from docx import Document
 # =========================
-# LOGGING SETUP
 # =========================
 logging.basicConfig(
     level=logging.INFO,
@@ -36,8 +36,20 @@ app = FastAPI(title="HF Resume Ingestion Service")
 # UTILITIES
 # =========================
 def download_file(url: str) -> bytes:
-    logger.info(f"⬇️ Downloading resume: {url}")
     r = requests.get(url, timeout=30)
     r.raise_for_status()
     return r.content
@@ -54,20 +66,30 @@ def extract_text_from_docx(data: bytes) -> str:
 def extract_resume_text(resume_url: str) -> str:
     data = download_file(resume_url)
-    if resume_url.lower().endswith(".pdf"):
         return extract_text_from_pdf(data)
-    elif resume_url.lower().endswith(".docx"):
         return extract_text_from_docx(data)
-    else:
-        raise ValueError("Unsupported resume format (only pdf/docx)")
 def basic_skill_extraction(text: str) -> list[str]:
     COMMON_SKILLS = [
         "python", "java", "javascript", "react", "node",
-        "firebase", "sql", "mongodb", "docker", "aws", "git", "linux"
     ]
     text_lower = text.lower()
     return sorted({skill for skill in COMMON_SKILLS if skill in text_lower})
@@ -103,30 +125,31 @@ async def upload_excel(file: UploadFile = File(...)):
     if not file.filename.endswith(".xlsx"):
         raise HTTPException(status_code=400, detail="Only .xlsx supported")
-    # ✅ FIXED: BytesIO wrapper
     content = await file.read()
     df = pd.read_excel(io.BytesIO(content))
-    required = {"name", "email", "phone", "jobId", "resume_url"}
-    if not required.issubset(df.columns):
         raise HTTPException(
             status_code=400,
-            detail=f"Invalid Excel format. Required columns: {required}",
         )
     report = []
     for index, row in df.iterrows():
-        logger.info(f"👤 Processing row {index + 1}: {row.get('email')}")
         try:
-            resume_text = extract_resume_text(row["resume_url"])
             skills = basic_skill_extraction(resume_text)
             payload = {
                 "candidate": {
                     "name": str(row["name"]),
-                    "email": str(row["email"]),
                     "phone": str(row["phone"]),
                     "jobId": str(row["jobId"]),
                 },
@@ -139,15 +162,15 @@ async def upload_excel(file: UploadFile = File(...)):
             send_to_n8n(payload)
             report.append({
-                "email": row["email"],
                 "status": "sent",
             })
         except Exception as e:
-            logger.error(f"❌ Error processing {row.get('email')}: {e}")
             report.append({
-                "email": row.get("email", "unknown"),
                 "status": "failed",
                 "error": str(e),
             })

 from docx import Document
 # =========================
+# LOGGING
 # =========================
 logging.basicConfig(
     level=logging.INFO,
 # UTILITIES
 # =========================
+def normalize_resume_url(url: str) -> str:
+    """
+    Convert Google Drive share links to direct download URLs.
+    """
+    if "drive.google.com" in url and "/file/d/" in url:
+        file_id = url.split("/file/d/")[1].split("/")[0]
+        return f"https://drive.google.com/uc?export=download&id={file_id}"
+    return url
 def download_file(url: str) -> bytes:
+    url = normalize_resume_url(url)
+    logger.info(f"⬇️ Downloading resume (normalized): {url}")
     r = requests.get(url, timeout=30)
     r.raise_for_status()
     return r.content
 def extract_resume_text(resume_url: str) -> str:
+    """
+    Detect file type by content signature, not URL.
+    """
     data = download_file(resume_url)
+    # PDF signature
+    if data[:4] == b"%PDF":
         return extract_text_from_pdf(data)
+    # DOCX signature (ZIP)
+    if data[:2] == b"PK":
         return extract_text_from_docx(data)
+    raise ValueError("Unsupported resume format (only pdf/docx)")
 def basic_skill_extraction(text: str) -> list[str]:
+    """
+    Lightweight heuristic skill extraction (NO AI evaluation here).
+    """
     COMMON_SKILLS = [
         "python", "java", "javascript", "react", "node",
+        "firebase", "sql", "mongodb", "docker",
+        "aws", "git", "linux"
     ]
     text_lower = text.lower()
     return sorted({skill for skill in COMMON_SKILLS if skill in text_lower})
     if not file.filename.endswith(".xlsx"):
         raise HTTPException(status_code=400, detail="Only .xlsx supported")
+    # FIXED: wrap bytes in BytesIO
     content = await file.read()
     df = pd.read_excel(io.BytesIO(content))
+    required_columns = {"name", "email", "phone", "jobId", "resume_url"}
+    if not required_columns.issubset(df.columns):
         raise HTTPException(
             status_code=400,
+            detail=f"Invalid Excel format. Required columns: {required_columns}",
         )
     report = []
     for index, row in df.iterrows():
+        email = str(row.get("email"))
+        logger.info(f"👤 Processing row {index + 1}: {email}")
         try:
+            resume_text = extract_resume_text(str(row["resume_url"]))
             skills = basic_skill_extraction(resume_text)
             payload = {
                 "candidate": {
                     "name": str(row["name"]),
+                    "email": email,
                     "phone": str(row["phone"]),
                     "jobId": str(row["jobId"]),
                 },
             send_to_n8n(payload)
             report.append({
+                "email": email,
                 "status": "sent",
             })
         except Exception as e:
+            logger.error(f"❌ Error processing {email}: {e}")
             report.append({
+                "email": email,
                 "status": "failed",
                 "error": str(e),
             })