bangaboy committed on
Commit
0132466
·
verified ·
1 Parent(s): 34860cb

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +128 -180
src/streamlit_app.py CHANGED
@@ -1,251 +1,199 @@
 
 
1
  import os
2
- os.environ["STREAMLIT_HOME"] = "/app/.streamlit"
 
3
  import streamlit as st
4
  import google.generativeai as genai
5
  from PIL import Image
6
  import fitz # PyMuPDF
7
  from docx import Document
 
 
8
  import json
9
  from pathlib import Path
10
  from datetime import datetime
11
  import re
12
- import pytesseract
13
- import io
14
-
15
 
 
16
  def extract_text_from_pdf(pdf_file):
17
- """Extract text from uploaded PDF file."""
18
  text_content = []
 
19
  try:
20
- pdf_bytes = pdf_file.read()
21
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
22
- for page_num in range(len(doc)):
23
- page = doc[page_num]
24
- text_content.append(page.get_text())
 
 
 
 
 
25
  return "\n".join(text_content)
26
  except Exception as e:
27
- st.error(f"Error in PDF extraction: {str(e)}")
28
  return ""
29
 
30
  def extract_text_from_docx(docx_file):
31
- """Extract text from uploaded DOCX file."""
32
  try:
33
  doc = Document(docx_file)
34
- text_content = []
35
- for paragraph in doc.paragraphs:
36
- text_content.append(paragraph.text)
37
- return "\n".join(text_content)
38
  except Exception as e:
39
- st.error(f"Error in DOCX extraction: {str(e)}")
40
  return ""
41
 
 
 
 
 
 
 
 
 
 
42
  def parse_date(date_str):
43
- """Parse date from various formats."""
44
  try:
45
- # Handle 'Present' or 'Current'
46
- if date_str.lower() in ['present', 'current', 'now']:
47
  return datetime.now()
48
-
49
  date_str = date_str.strip()
50
-
51
- formats = [
52
- '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
53
- '%Y/%m', '%Y-%m'
54
- ]
55
-
56
  for fmt in formats:
57
  try:
58
  return datetime.strptime(date_str, fmt)
59
- except ValueError:
60
  continue
61
-
62
- year_match = re.search(r'\b20\d{2}\b', date_str)
63
  if year_match:
64
- return datetime.strptime(year_match.group(), '%Y')
65
-
66
  return None
67
- except Exception:
68
  return None
69
 
70
  def calculate_experience(work_history):
71
- """Calculate total years of experience from work history."""
72
- total_experience = 0
73
- current_year = datetime.now().year
74
-
75
  for job in work_history:
76
- duration = job.get('duration', '')
77
  if not duration:
78
  continue
79
-
80
- parts = re.split(r'\s*-\s*|\s+to\s+', duration)
81
  if len(parts) != 2:
82
  continue
 
 
 
 
 
83
 
84
- start_date = parse_date(parts[0])
85
- end_date = parse_date(parts[1])
86
-
87
- if start_date and end_date:
88
- years = (end_date.year - start_date.year) + \
89
- (end_date.month - start_date.month) / 12
90
- total_experience += max(0, years)
91
-
92
- return round(total_experience, 1)
93
-
94
  def parse_resume(file_uploaded, api_key):
95
- """Parse resume and extract information."""
96
  genai.configure(api_key=api_key)
97
- model = genai.GenerativeModel('gemini-1.5-flash')
98
 
99
  prompt = """Extract the following information from this resume:
100
- 1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
101
- 2. Full Name
102
- 3. Email Address
103
- 4. Phone Number
104
- 5. Education History (including degree, institution, graduation year, and field of study)
105
- 6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
106
- 7. Skills
107
- 8. LinkedIn Profile URL
108
-
109
- Return the information in this JSON format:
110
- {
111
- "summary": "",
112
- "name": "",
113
- "email": "",
114
- "phone": "",
115
- "education": [
116
- {
117
- "degree": "",
118
- "institution": "",
119
- "year": "",
120
- "field": "",
121
- "gpa": ""
122
- }
123
- ],
124
- "work_experience": [
125
- {
126
- "company": "",
127
- "position": "",
128
- "duration": ""
129
- }
130
- ],
131
- "skills": [],
132
- "linkedin": ""
133
- }
134
- For skills include tools and technologies in output if present any in resume.
135
- For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
136
- Only return the JSON object, nothing else. If any field is not found, leave it empty."""
137
 
138
- try:
139
- file_extension = Path(file_uploaded.name).suffix.lower()
140
-
141
- if file_extension == '.pdf':
142
- text_content = extract_text_from_pdf(file_uploaded)
143
- elif file_extension in ['.docx', '.doc']:
144
- text_content = extract_text_from_docx(file_uploaded)
145
- elif file_extension in ['.jpg', '.jpeg', '.png']:
146
- image = Image.open(file_uploaded)
147
- text_content = pytesseract.image_to_string(image)
148
- else:
149
- st.error(f"Unsupported file format: {file_extension}")
150
- return None
151
 
 
 
152
  response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
153
-
154
- try:
155
- response_text = response.text
156
- json_start = response_text.find('{')
157
- json_end = response_text.rfind('}') + 1
158
- json_str = response_text[json_start:json_end]
159
-
160
- result = json.loads(json_str)
161
- total_exp = calculate_experience(result.get('work_experience', []))
162
- result['total_years_experience'] = total_exp
163
-
164
- return result
165
- except json.JSONDecodeError as e:
166
- st.error(f"Error parsing response: {str(e)}")
167
- return None
168
 
 
 
169
  except Exception as e:
170
- st.error(f"Error processing resume: {str(e)}")
171
  return None
172
 
 
173
  def format_education(edu):
174
- """Format education details for display."""
175
  parts = []
176
- if edu.get('degree'):
177
- parts.append(edu['degree'])
178
- if edu.get('field'):
179
  parts.append(f"in {edu['field']}")
180
- if edu.get('institution'):
181
  parts.append(f"from {edu['institution']}")
182
- if edu.get('year'):
183
  parts.append(f"({edu['year']})")
184
- if edu.get('gpa') and edu['gpa'].strip():
185
  parts.append(f"- GPA: {edu['gpa']}")
186
  return " ".join(parts)
187
 
 
188
  def main():
189
- st.title("Resume Parser")
190
- st.write("Upload a resume (PDF, DOCX, or Image) to extract information")
191
-
192
- # Get API key from secrets or user input
193
  api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
194
-
195
-
196
- uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])
197
 
198
  if uploaded_file and api_key:
199
- with st.spinner('Analyzing resume...'):
200
  result = parse_resume(uploaded_file, api_key)
201
 
202
- if result:
203
- st.subheader("Extracted Information")
204
-
205
- # Display summary in a text area
206
- st.text_area("Summary", result.get('summary', 'Not found'), height=100)
207
-
208
- # Display personal information
209
- col1, col2, col3 = st.columns(3)
210
- with col1:
211
- st.write("**Name:**", result.get('name', 'Not found'))
212
- with col2:
213
- st.write("**Email:**", result.get('email', 'Not found'))
214
- with col3:
215
- st.write("**Phone:**", result.get('phone', 'Not found'))
216
-
217
- # Display total experience
218
- total_exp = result.get('total_years_experience', 0)
219
- exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
220
- st.write("**Total Experience:**", exp_text)
221
-
222
- # Display education
223
- st.subheader("Education")
224
- if result.get('education'):
225
- for edu in result['education']:
226
- st.write(f"- {format_education(edu)}")
227
- else:
228
- st.write("No education information found")
229
-
230
- # Display work experience
231
- st.subheader("Work Experience")
232
- if result.get('work_experience'):
233
- for exp in result['work_experience']:
234
- duration = f" ({exp.get('duration', 'Duration not specified')})" if exp.get('duration') else ""
235
- st.write(f"- {exp.get('position', 'Role not found')} at {exp.get('company', 'Company not found')}{duration}")
236
- else:
237
- st.write("No work experience found")
238
-
239
- # Display Skills
240
- st.subheader("Skills:")
241
- if result.get('skills'):
242
- for skill in result['skills']:
243
- st.write(f"- {skill}")
244
- else:
245
- st.write("- No skills found")
246
-
247
- # Display LinkedIn profile
248
- st.write("**LinkedIn Profile:**", result.get('linkedin', 'Not found'))
249
 
250
  if __name__ == "__main__":
251
- main()
 
1
+ # === app.py ===
2
+
3
  import os
4
+ os.environ["STREAMLIT_HOME"] = "/app/.streamlit" # Must be first
5
+
6
  import streamlit as st
7
  import google.generativeai as genai
8
  from PIL import Image
9
  import fitz # PyMuPDF
10
  from docx import Document
11
+ import pytesseract
12
+ import io
13
  import json
14
  from pathlib import Path
15
  from datetime import datetime
16
  import re
 
 
 
17
 
18
+ # ---------------- PDF / DOCX / IMAGE EXTRACTION ----------------
19
  def extract_text_from_pdf(pdf_file):
20
+ """Extract text from PDF, with OCR fallback for scanned PDFs."""
21
  text_content = []
22
+ pdf_bytes = pdf_file.read()
23
  try:
 
24
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
25
+ for page in doc:
26
+ page_text = page.get_text()
27
+ if not page_text.strip():
28
+ # Fallback to OCR
29
+ pix = page.get_pixmap()
30
+ img = Image.open(io.BytesIO(pix.tobytes("png")))
31
+ page_text = pytesseract.image_to_string(img)
32
+ text_content.append(page_text)
33
  return "\n".join(text_content)
34
  except Exception as e:
35
+ st.error(f"PDF extraction error: {str(e)}")
36
  return ""
37
 
38
  def extract_text_from_docx(docx_file):
 
39
  try:
40
  doc = Document(docx_file)
41
+ return "\n".join([p.text for p in doc.paragraphs])
 
 
 
42
  except Exception as e:
43
+ st.error(f"DOCX extraction error: {str(e)}")
44
  return ""
45
 
46
def extract_text_from_image(image_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        image_file: Path or file-like object accepted by ``Image.open``.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""``
        when opening or recognising the image fails (the error is shown via
        ``st.error``).
    """
    try:
        ocr_text = pytesseract.image_to_string(Image.open(image_file))
    except Exception as err:
        st.error(f"Image extraction error: {str(err)}")
        return ""
    return ocr_text
53
+
54
+ # ---------------- DATE / EXPERIENCE CALCULATION ----------------
55
  def parse_date(date_str):
 
56
  try:
57
+ if date_str.lower() in ["present", "current", "now"]:
 
58
  return datetime.now()
 
59
  date_str = date_str.strip()
60
+ formats = ["%Y", "%b %Y", "%B %Y", "%m/%Y", "%m-%Y", "%Y/%m", "%Y-%m"]
 
 
 
 
 
61
  for fmt in formats:
62
  try:
63
  return datetime.strptime(date_str, fmt)
64
+ except:
65
  continue
66
+ year_match = re.search(r"\b20\d{2}\b", date_str)
 
67
  if year_match:
68
+ return datetime.strptime(year_match.group(), "%Y")
 
69
  return None
70
+ except:
71
  return None
72
 
73
  def calculate_experience(work_history):
74
+ total_exp = 0
 
 
 
75
  for job in work_history:
76
+ duration = job.get("duration", "")
77
  if not duration:
78
  continue
79
+ parts = re.split(r"\s*-\s*|\s+to\s+", duration)
 
80
  if len(parts) != 2:
81
  continue
82
+ start, end = parse_date(parts[0]), parse_date(parts[1])
83
+ if start and end:
84
+ years = (end.year - start.year) + (end.month - start.month)/12
85
+ total_exp += max(0, years)
86
+ return round(total_exp, 1)
87
 
88
+ # ---------------- RESUME PARSING ----------------
 
 
 
 
 
 
 
 
 
89
  def parse_resume(file_uploaded, api_key):
 
90
  genai.configure(api_key=api_key)
91
+ model = genai.GenerativeModel("gemini-1.5-flash")
92
 
93
  prompt = """Extract the following information from this resume:
94
+ 1. Summarize in 100 words, focus on skills, experience, qualifications.
95
+ 2. Full Name
96
+ 3. Email
97
+ 4. Phone
98
+ 5. Education (degree, institution, year, field)
99
+ 6. Work experience with exact duration (e.g., Jan 2020 - Present)
100
+ 7. Skills
101
+ 8. LinkedIn URL
102
+
103
+ Return as JSON:
104
+ {
105
+ "summary": "", "name": "", "email": "", "phone": "",
106
+ "education": [{"degree": "", "institution": "", "year": "", "field": "", "gpa": ""}],
107
+ "work_experience": [{"company": "", "position": "", "duration": ""}],
108
+ "skills": [], "linkedin": ""
109
+ }"""
110
+
111
+ # Extract text
112
+ ext = Path(file_uploaded.name).suffix.lower()
113
+ if ext == ".pdf":
114
+ text_content = extract_text_from_pdf(file_uploaded)
115
+ elif ext in [".docx", ".doc"]:
116
+ text_content = extract_text_from_docx(file_uploaded)
117
+ elif ext in [".jpg", ".jpeg", ".png"]:
118
+ text_content = extract_text_from_image(file_uploaded)
119
+ else:
120
+ st.error(f"Unsupported file type: {ext}")
121
+ return None
 
 
 
 
 
 
 
 
 
122
 
123
+ if not text_content.strip():
124
+ st.error("No text found in resume.")
125
+ return None
 
 
 
 
 
 
 
 
 
 
126
 
127
+ # Generate JSON from Gemini
128
+ try:
129
  response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
130
+ response_text = response.text
131
+ st.text_area("Raw Response", response_text, height=200) # Debugging
132
+
133
+ # Extract JSON
134
+ json_start = response_text.find("{")
135
+ json_end = response_text.rfind("}") + 1
136
+ json_str = response_text[json_start:json_end]
137
+ result = json.loads(json_str)
 
 
 
 
 
 
 
138
 
139
+ result["total_years_experience"] = calculate_experience(result.get("work_experience", []))
140
+ return result
141
  except Exception as e:
142
+ st.error(f"Error parsing resume: {str(e)}")
143
  return None
144
 
145
+ # ---------------- FORMAT EDUCATION ----------------
146
  def format_education(edu):
 
147
  parts = []
148
+ if edu.get("degree"):
149
+ parts.append(edu["degree"])
150
+ if edu.get("field"):
151
  parts.append(f"in {edu['field']}")
152
+ if edu.get("institution"):
153
  parts.append(f"from {edu['institution']}")
154
+ if edu.get("year"):
155
  parts.append(f"({edu['year']})")
156
+ if edu.get("gpa"):
157
  parts.append(f"- GPA: {edu['gpa']}")
158
  return " ".join(parts)
159
 
160
+ # ---------------- MAIN APP ----------------
161
  def main():
162
+ st.title("Resume Parser (PDF/DOCX/Image)")
 
 
 
163
  api_key = os.getenv("GEMINI_API_KEY") or st.text_input("Enter Gemini API Key", type="password")
164
+ uploaded_file = st.file_uploader("Choose a resume file", type=["pdf","docx","doc","jpg","jpeg","png"])
 
 
165
 
166
  if uploaded_file and api_key:
167
+ with st.spinner("Analyzing resume..."):
168
  result = parse_resume(uploaded_file, api_key)
169
 
170
+ if result:
171
+ st.subheader("Extracted Information")
172
+ st.text_area("Summary", result.get("summary",""), height=100)
173
+
174
+ col1, col2, col3 = st.columns(3)
175
+ col1.write("**Name:** "+result.get("name",""))
176
+ col2.write("**Email:** "+result.get("email",""))
177
+ col3.write("**Phone:** "+result.get("phone",""))
178
+
179
+ exp = result.get("total_years_experience",0)
180
+ exp_text = f"{exp:.1f} years" if exp >= 1 else f"{exp*12:.0f} months"
181
+ st.write("**Total Experience:**", exp_text)
182
+
183
+ st.subheader("Education")
184
+ for edu in result.get("education", []):
185
+ st.write("- "+format_education(edu))
186
+
187
+ st.subheader("Work Experience")
188
+ for w in result.get("work_experience", []):
189
+ dur = f" ({w.get('duration','')})" if w.get("duration") else ""
190
+ st.write(f"- {w.get('position','')} at {w.get('company','')}{dur}")
191
+
192
+ st.subheader("Skills")
193
+ for s in result.get("skills", []):
194
+ st.write("- "+s)
195
+
196
+ st.write("**LinkedIn:**", result.get("linkedin",""))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  if __name__ == "__main__":
199
+ main()