bangaboy committed on
Commit
de06fc0
·
verified ·
1 Parent(s): 10f784f

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +130 -299
src/streamlit_app.py CHANGED
@@ -7,87 +7,22 @@ import json
7
  from pathlib import Path
8
  from datetime import datetime
9
  import re
10
- import os
11
  import io
12
- import tempfile
13
- import sys
14
-
15
- # Set page config first
16
- st.set_page_config(
17
- page_title="Resume Parser",
18
- page_icon="📄",
19
- layout="wide"
20
- )
21
-
22
- # Handle Streamlit configuration for Hugging Face Spaces
23
- if not os.path.exists(os.path.expanduser("~/.streamlit")):
24
- try:
25
- os.makedirs(os.path.expanduser("~/.streamlit"), exist_ok=True)
26
- except:
27
- pass # Ignore permission errors
28
 
29
  def extract_text_from_pdf(pdf_file):
30
- """Extract text from uploaded PDF file with better error handling."""
 
31
  try:
32
- # Read PDF bytes
33
  pdf_bytes = pdf_file.read()
34
- pdf_file.seek(0) # Reset file pointer
35
-
36
- # Debug info
37
- st.write(f"📖 PDF file size: {len(pdf_bytes)} bytes")
38
-
39
- if len(pdf_bytes) == 0:
40
- st.error("❌ PDF file is empty")
41
- return ""
42
-
43
- # Create temporary file for PyMuPDF
44
- with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
45
- tmp_file.write(pdf_bytes)
46
- tmp_file_path = tmp_file.name
47
-
48
- try:
49
- # Open PDF document
50
- doc = fitz.open(tmp_file_path)
51
- st.write(f"📄 PDF has {len(doc)} pages")
52
-
53
- text_content = []
54
- for page_num in range(len(doc)):
55
- page = doc[page_num]
56
- page_text = page.get_text()
57
- text_content.append(page_text)
58
- st.write(f"Page {page_num + 1}: {len(page_text)} characters")
59
-
60
- doc.close()
61
- full_text = "\n".join(text_content)
62
- st.write(f"✅ Total text extracted: {len(full_text)} characters")
63
-
64
- return full_text
65
-
66
- finally:
67
- # Clean up temporary file
68
- try:
69
- os.unlink(tmp_file_path)
70
- except:
71
- pass
72
-
73
  except Exception as e:
74
- st.error(f"❌ PDF extraction error: {str(e)}")
75
- # Try alternative approach with stream
76
- try:
77
- pdf_file.seek(0)
78
- pdf_bytes = pdf_file.read()
79
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
80
-
81
- text_content = []
82
- for page_num in range(len(doc)):
83
- page = doc[page_num]
84
- text_content.append(page.get_text())
85
-
86
- doc.close()
87
- return "\n".join(text_content)
88
- except Exception as e2:
89
- st.error(f"❌ Alternative PDF extraction also failed: {str(e2)}")
90
- return ""
91
 
92
  def extract_text_from_docx(docx_file):
93
  """Extract text from uploaded DOCX file."""
@@ -96,21 +31,20 @@ def extract_text_from_docx(docx_file):
96
  text_content = []
97
  for paragraph in doc.paragraphs:
98
  text_content.append(paragraph.text)
99
-
100
- full_text = "\n".join(text_content)
101
- st.write(f"✅ DOCX text extracted: {len(full_text)} characters")
102
- return full_text
103
  except Exception as e:
104
- st.error(f"❌ DOCX extraction error: {str(e)}")
105
  return ""
106
 
107
  def parse_date(date_str):
108
  """Parse date from various formats."""
109
  try:
 
110
  if date_str.lower() in ['present', 'current', 'now']:
111
  return datetime.now()
112
 
113
  date_str = date_str.strip()
 
114
  formats = [
115
  '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
116
  '%Y/%m', '%Y-%m'
@@ -133,7 +67,8 @@ def parse_date(date_str):
133
  def calculate_experience(work_history):
134
  """Calculate total years of experience from work history."""
135
  total_experience = 0
136
-
 
137
  for job in work_history:
138
  duration = job.get('duration', '')
139
  if not duration:
@@ -153,107 +88,87 @@ def calculate_experience(work_history):
153
 
154
  return round(total_experience, 1)
155
 
156
- def get_api_key():
157
- """Get API key from environment or user input."""
158
- # Try environment variable (Hugging Face Spaces)
159
- api_key = os.getenv("GEMINI_API_KEY")
160
- if api_key:
161
- return api_key
162
-
163
- # Try Streamlit secrets (fallback)
164
- try:
165
- if hasattr(st, 'secrets') and st.secrets and "GEMINI_API_KEY" in st.secrets:
166
- return st.secrets["GEMINI_API_KEY"]
167
- except:
168
- pass
169
-
170
- return None
171
-
172
- def parse_resume_with_gemini(text_content, api_key):
173
- """Use Gemini API to parse resume text."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  try:
175
- # Configure API
176
- genai.configure(api_key=api_key)
177
- model = genai.GenerativeModel('gemini-1.5-flash')
178
-
179
- prompt = """You are a resume parser. Extract the following information from this resume text and return ONLY a valid JSON object:
180
-
181
- {
182
- "summary": "Brief 100-word summary of the candidate",
183
- "name": "Full name",
184
- "email": "Email address",
185
- "phone": "Phone number",
186
- "education": [
187
- {
188
- "degree": "Degree name",
189
- "institution": "School/University name",
190
- "year": "Graduation year",
191
- "field": "Field of study",
192
- "gpa": "GPA if mentioned"
193
- }
194
- ],
195
- "work_experience": [
196
- {
197
- "company": "Company name",
198
- "position": "Job title",
199
- "duration": "Employment period (e.g., Jan 2020 - Present)"
200
- }
201
- ],
202
- "skills": ["List of skills, tools, technologies"],
203
- "linkedin": "LinkedIn profile URL"
204
- }
205
-
206
- Rules:
207
- - Return ONLY valid JSON, no other text
208
- - If information is not found, use empty string or empty array
209
- - For dates, use format like "Jan 2020 - Dec 2022" or "2020 - Present"
210
- - Include all technical skills, programming languages, tools mentioned
211
-
212
- Resume text:
213
- """
214
-
215
- # Make API call
216
- response = model.generate_content(prompt + text_content)
217
-
218
- if not response or not response.text:
219
- st.error("❌ Empty response from Gemini API")
220
  return None
 
 
221
 
222
- # Clean and parse JSON
223
- response_text = response.text.strip()
224
-
225
- # Find JSON in response
226
- json_start = response_text.find('{')
227
- json_end = response_text.rfind('}') + 1
228
-
229
- if json_start == -1 or json_end <= json_start:
230
- st.error("❌ No valid JSON found in response")
231
- st.code(response_text[:500])
 
 
 
232
  return None
233
-
234
- json_str = response_text[json_start:json_end]
235
-
236
- # Parse JSON
237
- result = json.loads(json_str)
238
-
239
- # Add calculated experience
240
- total_exp = calculate_experience(result.get('work_experience', []))
241
- result['total_years_experience'] = total_exp
242
-
243
- return result
244
-
245
- except json.JSONDecodeError as e:
246
- st.error(f"❌ JSON parsing error: {e}")
247
- st.code(json_str if 'json_str' in locals() else "No JSON extracted")
248
- return None
249
  except Exception as e:
250
- st.error(f"❌ API error: {e}")
251
- if "API_KEY" in str(e) or "key" in str(e).lower():
252
- st.error("🔑 Please check your Gemini API key")
253
  return None
254
 
255
  def format_education(edu):
256
- """Format education for display."""
257
  parts = []
258
  if edu.get('degree'):
259
  parts.append(edu['degree'])
@@ -265,152 +180,68 @@ def format_education(edu):
265
  parts.append(f"({edu['year']})")
266
  if edu.get('gpa') and edu['gpa'].strip():
267
  parts.append(f"- GPA: {edu['gpa']}")
268
- return " ".join(parts) if parts else "No details available"
269
 
270
  def main():
271
- st.title("🔍 Resume Parser")
272
- st.markdown("Upload a resume (PDF, DOCX, or Image) to extract structured information")
273
-
274
- # API Key handling
275
- api_key = get_api_key()
276
-
277
- if not api_key:
278
- st.warning("⚠️ No API key found in environment variables")
279
- api_key = st.text_input(
280
- "Enter your Gemini API Key:",
281
- type="password",
282
- help="Get your API key from https://makersuite.google.com/app/apikey"
283
- )
284
- if not api_key:
285
- st.info("👆 Please enter your Gemini API key to continue")
286
- st.stop()
287
- else:
288
- st.success("✅ API key loaded from environment")
289
-
290
- # File upload
291
- uploaded_file = st.file_uploader(
292
- "Choose a resume file",
293
- type=["pdf", "docx", "doc", "jpg", "jpeg", "png"],
294
- help="Supported formats: PDF, DOCX, DOC, JPG, JPEG, PNG"
295
- )
296
-
297
- if uploaded_file is not None:
298
- st.write(f"📁 File uploaded: **{uploaded_file.name}** ({uploaded_file.size} bytes)")
299
-
300
- # Process file
301
- with st.spinner("🔄 Processing resume..."):
302
- # Extract text based on file type
303
- file_extension = Path(uploaded_file.name).suffix.lower()
304
-
305
- text_content = ""
306
- if file_extension == '.pdf':
307
- text_content = extract_text_from_pdf(uploaded_file)
308
- elif file_extension in ['.docx', '.doc']:
309
- text_content = extract_text_from_docx(uploaded_file)
310
- elif file_extension in ['.jpg', '.jpeg', '.png']:
311
- st.info("📷 Image processing requires OCR - this may take longer")
312
- try:
313
- image = Image.open(uploaded_file)
314
- import pytesseract
315
- text_content = pytesseract.image_to_string(image)
316
- st.write(f"✅ OCR extracted: {len(text_content)} characters")
317
- except ImportError:
318
- st.error("❌ OCR not available. Please use PDF or DOCX files.")
319
- return
320
- except Exception as e:
321
- st.error(f"❌ OCR error: {e}")
322
- return
323
- else:
324
- st.error(f"❌ Unsupported file type: {file_extension}")
325
- return
326
-
327
- # Check if text was extracted
328
- if not text_content or len(text_content.strip()) < 20:
329
- st.error("❌ Could not extract meaningful text from the file")
330
- st.info("💡 Try using a different file format or ensure the file contains readable text")
331
- if text_content:
332
- st.write("Extracted text preview:")
333
- st.code(text_content[:200])
334
- return
335
-
336
- # Show text preview
337
- with st.expander("📄 View extracted text (first 300 characters)"):
338
- st.code(text_content[:300] + "..." if len(text_content) > 300 else text_content)
339
-
340
- # Parse with Gemini
341
- st.write("🤖 Analyzing with AI...")
342
- result = parse_resume_with_gemini(text_content, api_key)
343
-
344
  if result:
345
- st.success("✅ Resume parsed successfully!")
346
 
347
- # Display results
348
- st.header("📊 Extracted Information")
349
 
350
- # Summary
351
- if result.get('summary'):
352
- st.subheader("📝 Summary")
353
- st.write(result['summary'])
354
-
355
- # Personal info
356
- st.subheader("👤 Personal Information")
357
  col1, col2, col3 = st.columns(3)
358
-
359
  with col1:
360
- st.metric("Name", result.get('name', 'Not found'))
361
  with col2:
362
- st.metric("Email", result.get('email', 'Not found'))
363
  with col3:
364
- st.metric("Phone", result.get('phone', 'Not found'))
365
-
366
- # Experience
367
  total_exp = result.get('total_years_experience', 0)
368
- if total_exp > 0:
369
- exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
370
- st.metric("Total Experience", exp_text)
371
-
372
- # Education
373
- st.subheader("🎓 Education")
374
  if result.get('education'):
375
  for edu in result['education']:
376
- st.write(f"• {format_education(edu)}")
377
  else:
378
  st.write("No education information found")
379
-
380
- # Work Experience
381
- st.subheader("💼 Work Experience")
382
  if result.get('work_experience'):
383
  for exp in result['work_experience']:
384
  duration = f" ({exp.get('duration', 'Duration not specified')})" if exp.get('duration') else ""
385
- st.write(f"• **{exp.get('position', 'Position not found')}** at {exp.get('company', 'Company not found')}{duration}")
386
  else:
387
  st.write("No work experience found")
388
-
389
- # Skills
390
- st.subheader("🛠️ Skills")
391
  if result.get('skills'):
392
- # Display skills as tags
393
- skills_text = " • ".join(result['skills'])
394
- st.write(skills_text)
395
  else:
396
- st.write("No skills found")
397
-
398
- # LinkedIn
399
- if result.get('linkedin'):
400
- st.subheader("🔗 LinkedIn Profile")
401
- st.write(result['linkedin'])
402
-
403
- # Download results
404
- st.subheader("💾 Download Results")
405
- json_str = json.dumps(result, indent=2)
406
- st.download_button(
407
- label="Download JSON",
408
- data=json_str,
409
- file_name=f"resume_parsed_{uploaded_file.name}.json",
410
- mime="application/json"
411
- )
412
- else:
413
- st.error("❌ Failed to parse resume. Please check the file and try again.")
414
 
415
  if __name__ == "__main__":
416
  main()
 
7
  from pathlib import Path
8
  from datetime import datetime
9
  import re
10
+ import pytesseract
11
  import io
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def extract_text_from_pdf(pdf_file):
14
+ """Extract text from uploaded PDF file."""
15
+ text_content = []
16
  try:
 
17
  pdf_bytes = pdf_file.read()
18
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
19
+ for page_num in range(len(doc)):
20
+ page = doc[page_num]
21
+ text_content.append(page.get_text())
22
+ return "\n".join(text_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  except Exception as e:
24
+ st.error(f"Error in PDF extraction: {str(e)}")
25
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  def extract_text_from_docx(docx_file):
28
  """Extract text from uploaded DOCX file."""
 
31
  text_content = []
32
  for paragraph in doc.paragraphs:
33
  text_content.append(paragraph.text)
34
+ return "\n".join(text_content)
 
 
 
35
  except Exception as e:
36
+ st.error(f"Error in DOCX extraction: {str(e)}")
37
  return ""
38
 
39
  def parse_date(date_str):
40
  """Parse date from various formats."""
41
  try:
42
+ # Handle 'Present' or 'Current'
43
  if date_str.lower() in ['present', 'current', 'now']:
44
  return datetime.now()
45
 
46
  date_str = date_str.strip()
47
+
48
  formats = [
49
  '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
50
  '%Y/%m', '%Y-%m'
 
67
  def calculate_experience(work_history):
68
  """Calculate total years of experience from work history."""
69
  total_experience = 0
70
+ current_year = datetime.now().year
71
+
72
  for job in work_history:
73
  duration = job.get('duration', '')
74
  if not duration:
 
88
 
89
  return round(total_experience, 1)
90
 
91
+ def parse_resume(file_uploaded, api_key):
92
+ """Parse resume and extract information."""
93
+ genai.configure(api_key=api_key)
94
+ model = genai.GenerativeModel('gemini-1.5-flash')
95
+
96
+ prompt = """Extract the following information from this resume:
97
+ 1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
98
+ 2. Full Name
99
+ 3. Email Address
100
+ 4. Phone Number
101
+ 5. Education History (including degree, institution, graduation year, and field of study)
102
+ 6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
103
+ 7. Skills
104
+ 8. LinkedIn Profile URL
105
+
106
+ Return the information in this JSON format:
107
+ {
108
+ "summary": "",
109
+ "name": "",
110
+ "email": "",
111
+ "phone": "",
112
+ "education": [
113
+ {
114
+ "degree": "",
115
+ "institution": "",
116
+ "year": "",
117
+ "field": "",
118
+ "gpa": ""
119
+ }
120
+ ],
121
+ "work_experience": [
122
+ {
123
+ "company": "",
124
+ "position": "",
125
+ "duration": ""
126
+ }
127
+ ],
128
+ "skills": [],
129
+ "linkedin": ""
130
+ }
131
+ For skills include tools and technologies in output if present any in resume.
132
+ For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
133
+ Only return the JSON object, nothing else. If any field is not found, leave it empty."""
134
+
135
  try:
136
+ file_extension = Path(file_uploaded.name).suffix.lower()
137
+
138
+ if file_extension == '.pdf':
139
+ text_content = extract_text_from_pdf(file_uploaded)
140
+ elif file_extension in ['.docx', '.doc']:
141
+ text_content = extract_text_from_docx(file_uploaded)
142
+ elif file_extension in ['.jpg', '.jpeg', '.png']:
143
+ image = Image.open(file_uploaded)
144
+ text_content = pytesseract.image_to_string(image)
145
+ else:
146
+ st.error(f"Unsupported file format: {file_extension}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return None
148
+
149
+ response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
150
 
151
+ try:
152
+ response_text = response.text
153
+ json_start = response_text.find('{')
154
+ json_end = response_text.rfind('}') + 1
155
+ json_str = response_text[json_start:json_end]
156
+
157
+ result = json.loads(json_str)
158
+ total_exp = calculate_experience(result.get('work_experience', []))
159
+ result['total_years_experience'] = total_exp
160
+
161
+ return result
162
+ except json.JSONDecodeError as e:
163
+ st.error(f"Error parsing response: {str(e)}")
164
  return None
165
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  except Exception as e:
167
+ st.error(f"Error processing resume: {str(e)}")
 
 
168
  return None
169
 
170
  def format_education(edu):
171
+ """Format education details for display."""
172
  parts = []
173
  if edu.get('degree'):
174
  parts.append(edu['degree'])
 
180
  parts.append(f"({edu['year']})")
181
  if edu.get('gpa') and edu['gpa'].strip():
182
  parts.append(f"- GPA: {edu['gpa']}")
183
+ return " ".join(parts)
184
 
185
  def main():
186
+ st.title("Resume Parser")
187
+ st.write("Upload a resume (PDF, DOCX, or Image) to extract information")
188
+
189
+ # Get API key from secrets or user input
190
+ api_key = st.secrets["GEMINI_API_KEY"] if "GEMINI_API_KEY" in st.secrets else st.text_input("Enter Gemini API Key", type="password")
191
+
192
+ uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])
193
+
194
+ if uploaded_file and api_key:
195
+ with st.spinner('Analyzing resume...'):
196
+ result = parse_resume(uploaded_file, api_key)
197
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  if result:
199
+ st.subheader("Extracted Information")
200
 
201
+ # Display summary in a text area
202
+ st.text_area("Summary", result.get('summary', 'Not found'), height=100)
203
 
204
+ # Display personal information
 
 
 
 
 
 
205
  col1, col2, col3 = st.columns(3)
 
206
  with col1:
207
+ st.write("**Name:**", result.get('name', 'Not found'))
208
  with col2:
209
+ st.write("**Email:**", result.get('email', 'Not found'))
210
  with col3:
211
+ st.write("**Phone:**", result.get('phone', 'Not found'))
212
+
213
+ # Display total experience
214
  total_exp = result.get('total_years_experience', 0)
215
+ exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
216
+ st.write("**Total Experience:**", exp_text)
217
+
218
+ # Display education
219
+ st.subheader("Education")
 
220
  if result.get('education'):
221
  for edu in result['education']:
222
+ st.write(f"- {format_education(edu)}")
223
  else:
224
  st.write("No education information found")
225
+
226
+ # Display work experience
227
+ st.subheader("Work Experience")
228
  if result.get('work_experience'):
229
  for exp in result['work_experience']:
230
  duration = f" ({exp.get('duration', 'Duration not specified')})" if exp.get('duration') else ""
231
+ st.write(f"- {exp.get('position', 'Role not found')} at {exp.get('company', 'Company not found')}{duration}")
232
  else:
233
  st.write("No work experience found")
234
+
235
+ # Display Skills
236
+ st.subheader("Skills:")
237
  if result.get('skills'):
238
+ for skill in result['skills']:
239
+ st.write(f"- {skill}")
 
240
  else:
241
+ st.write("- No skills found")
242
+
243
+ # Display LinkedIn profile
244
+ st.write("**LinkedIn Profile:**", result.get('linkedin', 'Not found'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  if __name__ == "__main__":
247
  main()