omgy committed on
Commit
1aa239f
·
verified ·
1 Parent(s): 626293c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -17
app.py CHANGED
@@ -9,7 +9,7 @@ from pypdf import PdfReader
9
  from docx import Document
10
 
11
  # =========================
12
- # LOGGING SETUP
13
  # =========================
14
  logging.basicConfig(
15
  level=logging.INFO,
@@ -36,8 +36,20 @@ app = FastAPI(title="HF Resume Ingestion Service")
36
  # UTILITIES
37
  # =========================
38
 
 
 
 
 
 
 
 
 
 
 
39
  def download_file(url: str) -> bytes:
40
- logger.info(f"⬇️ Downloading resume: {url}")
 
 
41
  r = requests.get(url, timeout=30)
42
  r.raise_for_status()
43
  return r.content
@@ -54,20 +66,30 @@ def extract_text_from_docx(data: bytes) -> str:
54
 
55
 
56
  def extract_resume_text(resume_url: str) -> str:
 
 
 
57
  data = download_file(resume_url)
58
 
59
- if resume_url.lower().endswith(".pdf"):
 
60
  return extract_text_from_pdf(data)
61
- elif resume_url.lower().endswith(".docx"):
 
 
62
  return extract_text_from_docx(data)
63
- else:
64
- raise ValueError("Unsupported resume format (only pdf/docx)")
65
 
66
 
67
  def basic_skill_extraction(text: str) -> list[str]:
 
 
 
68
  COMMON_SKILLS = [
69
  "python", "java", "javascript", "react", "node",
70
- "firebase", "sql", "mongodb", "docker", "aws", "git", "linux"
 
71
  ]
72
  text_lower = text.lower()
73
  return sorted({skill for skill in COMMON_SKILLS if skill in text_lower})
@@ -103,30 +125,31 @@ async def upload_excel(file: UploadFile = File(...)):
103
  if not file.filename.endswith(".xlsx"):
104
  raise HTTPException(status_code=400, detail="Only .xlsx supported")
105
 
106
- # FIXED: BytesIO wrapper
107
  content = await file.read()
108
  df = pd.read_excel(io.BytesIO(content))
109
 
110
- required = {"name", "email", "phone", "jobId", "resume_url"}
111
- if not required.issubset(df.columns):
112
  raise HTTPException(
113
  status_code=400,
114
- detail=f"Invalid Excel format. Required columns: {required}",
115
  )
116
 
117
  report = []
118
 
119
  for index, row in df.iterrows():
120
- logger.info(f"👤 Processing row {index + 1}: {row.get('email')}")
 
121
 
122
  try:
123
- resume_text = extract_resume_text(row["resume_url"])
124
  skills = basic_skill_extraction(resume_text)
125
 
126
  payload = {
127
  "candidate": {
128
  "name": str(row["name"]),
129
- "email": str(row["email"]),
130
  "phone": str(row["phone"]),
131
  "jobId": str(row["jobId"]),
132
  },
@@ -139,15 +162,15 @@ async def upload_excel(file: UploadFile = File(...)):
139
  send_to_n8n(payload)
140
 
141
  report.append({
142
- "email": row["email"],
143
  "status": "sent",
144
  })
145
 
146
  except Exception as e:
147
- logger.error(f"❌ Error processing {row.get('email')}: {e}")
148
 
149
  report.append({
150
- "email": row.get("email", "unknown"),
151
  "status": "failed",
152
  "error": str(e),
153
  })
 
9
  from docx import Document
10
 
11
  # =========================
12
+ # LOGGING
13
  # =========================
14
  logging.basicConfig(
15
  level=logging.INFO,
 
36
  # UTILITIES
37
  # =========================
38
 
39
def normalize_resume_url(url: str) -> str:
    """Convert a Google Drive share link into a direct-download URL.

    Two share-link shapes are recognised:

    * ``.../file/d/<id>[/view][?query][#fragment]``
    * ``.../open?id=<id>``

    Args:
        url: The resume URL as supplied by the caller.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<id>`` when a Drive
        file id can be extracted; otherwise ``url`` unchanged.
    """
    from urllib.parse import parse_qs, urlsplit

    if "drive.google.com" not in url:
        return url

    file_id = ""
    if "/file/d/" in url:
        file_id = url.split("/file/d/", 1)[1]
        # The id ends at the next path separator, query string or fragment;
        # cutting only at "/" would leave "?usp=sharing" glued to the id.
        for separator in ("/", "?", "#"):
            file_id = file_id.split(separator, 1)[0]
    else:
        parts = urlsplit(url)
        if parts.path.rstrip("/").endswith("/open"):
            file_id = parse_qs(parts.query).get("id", [""])[0]

    # Without an id there is nothing to rewrite; hand the URL back untouched
    # rather than building a download link with an empty id.
    if not file_id:
        return url
    return f"https://drive.google.com/uc?export=download&id={file_id}"
47
+
48
+
49
def download_file(url: str) -> bytes:
    """Fetch the resume behind *url* and return the raw response body.

    The URL is first passed through ``normalize_resume_url`` and the
    normalized form is what gets logged and requested. The request uses a
    30-second timeout, and ``raise_for_status()`` is called on the response
    before its content is returned.
    """
    direct_url = normalize_resume_url(url)
    logger.info(f"⬇️ Downloading resume (normalized): {direct_url}")

    response = requests.get(direct_url, timeout=30)
    response.raise_for_status()
    body = response.content
    return body
 
66
 
67
 
68
def extract_resume_text(resume_url: str) -> str:
    """Download a resume and extract its text.

    The format is chosen from the leading bytes of the downloaded content,
    not from the URL: content starting with ``%PDF`` goes to
    ``extract_text_from_pdf`` and content starting with ``PK`` (the ZIP
    container DOCX files use) goes to ``extract_text_from_docx``.

    Raises:
        ValueError: If the content starts with neither signature.
    """
    blob = download_file(resume_url)

    is_pdf = blob.startswith(b"%PDF")
    if is_pdf:
        return extract_text_from_pdf(blob)

    is_zip_container = blob.startswith(b"PK")
    if is_zip_container:
        return extract_text_from_docx(blob)

    raise ValueError("Unsupported resume format (only pdf/docx)")
83
 
84
 
85
def basic_skill_extraction(text: str) -> list[str]:
    """Return the known skills mentioned in *text*, sorted alphabetically.

    Lightweight heuristic skill extraction (NO AI evaluation here). Matching
    is case-insensitive and requires that a term is not embedded in a longer
    alphanumeric run, so "java" is not reported for "javascript", "git" is
    not reported for "digital" and "aws" is not reported for "flaws".

    A small alias table keeps common compound spellings ("nodejs",
    "reactjs", "mysql", "postgresql", "github", "gitlab") counting towards
    their base skill.

    Args:
        text: Plain text extracted from a resume; may be empty.

    Returns:
        Sorted list of unique skill names drawn from the built-in skill list.
    """
    import re

    COMMON_SKILLS = [
        "python", "java", "javascript", "react", "node",
        "firebase", "sql", "mongodb", "docker",
        "aws", "git", "linux",
    ]
    # Compound spellings that boundary matching alone would no longer catch.
    ALIASES = {
        "node": ("nodejs",),
        "react": ("reactjs",),
        "sql": ("mysql", "postgresql"),
        "git": ("github", "gitlab"),
    }
    text_lower = text.lower()

    def mentioned(term: str) -> bool:
        # Look-arounds instead of \b so punctuation such as "." or "/" next
        # to a term ("node.js", "git/linux") still counts as a boundary.
        pattern = rf"(?<![a-z0-9]){re.escape(term)}(?![a-z0-9])"
        return re.search(pattern, text_lower) is not None

    return sorted(
        {
            skill
            for skill in COMMON_SKILLS
            if any(mentioned(term) for term in (skill, *ALIASES.get(skill, ())))
        }
    )
 
125
  if not file.filename.endswith(".xlsx"):
126
  raise HTTPException(status_code=400, detail="Only .xlsx supported")
127
 
128
+ # FIXED: wrap bytes in BytesIO
129
  content = await file.read()
130
  df = pd.read_excel(io.BytesIO(content))
131
 
132
+ required_columns = {"name", "email", "phone", "jobId", "resume_url"}
133
+ if not required_columns.issubset(df.columns):
134
  raise HTTPException(
135
  status_code=400,
136
+ detail=f"Invalid Excel format. Required columns: {required_columns}",
137
  )
138
 
139
  report = []
140
 
141
  for index, row in df.iterrows():
142
+ email = str(row.get("email"))
143
+ logger.info(f"👤 Processing row {index + 1}: {email}")
144
 
145
  try:
146
+ resume_text = extract_resume_text(str(row["resume_url"]))
147
  skills = basic_skill_extraction(resume_text)
148
 
149
  payload = {
150
  "candidate": {
151
  "name": str(row["name"]),
152
+ "email": email,
153
  "phone": str(row["phone"]),
154
  "jobId": str(row["jobId"]),
155
  },
 
162
  send_to_n8n(payload)
163
 
164
  report.append({
165
+ "email": email,
166
  "status": "sent",
167
  })
168
 
169
  except Exception as e:
170
+ logger.error(f"❌ Error processing {email}: {e}")
171
 
172
  report.append({
173
+ "email": email,
174
  "status": "failed",
175
  "error": str(e),
176
  })