Spaces:

bangaboy
/

pp

Sleeping

App Files Community

bangaboy committed on Sep 8, 2025

Commit

de06fc0

verified ·

1 Parent(s): 10f784f

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +130 -299

src/streamlit_app.py CHANGED Viewed

@@ -7,87 +7,22 @@ import json
 from pathlib import Path
 from datetime import datetime
 import re
-import os
 import io
-import tempfile
-import sys
-# Set page config first
-st.set_page_config(
-    page_title="Resume Parser",
-    page_icon="📄",
-    layout="wide"
-)
-# Handle Streamlit configuration for Hugging Face Spaces
-if not os.path.exists(os.path.expanduser("~/.streamlit")):
-    try:
-        os.makedirs(os.path.expanduser("~/.streamlit"), exist_ok=True)
-    except:
-        pass  # Ignore permission errors
 def extract_text_from_pdf(pdf_file):
-    """Extract text from uploaded PDF file with better error handling."""
     try:
-        # Read PDF bytes
         pdf_bytes = pdf_file.read()
-        pdf_file.seek(0)  # Reset file pointer
-        # Debug info
-        st.write(f"📖 PDF file size: {len(pdf_bytes)} bytes")
-        if len(pdf_bytes) == 0:
-            st.error("❌ PDF file is empty")
-            return ""
-        # Create temporary file for PyMuPDF
-        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
-            tmp_file.write(pdf_bytes)
-            tmp_file_path = tmp_file.name
-        try:
-            # Open PDF document
-            doc = fitz.open(tmp_file_path)
-            st.write(f"📄 PDF has {len(doc)} pages")
-            text_content = []
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-                page_text = page.get_text()
-                text_content.append(page_text)
-                st.write(f"Page {page_num + 1}: {len(page_text)} characters")
-            doc.close()
-            full_text = "\n".join(text_content)
-            st.write(f"✅ Total text extracted: {len(full_text)} characters")
-            return full_text
-        finally:
-            # Clean up temporary file
-            try:
-                os.unlink(tmp_file_path)
-            except:
-                pass
     except Exception as e:
-        st.error(f"❌ PDF extraction error: {str(e)}")
-        # Try alternative approach with stream
-        try:
-            pdf_file.seek(0)
-            pdf_bytes = pdf_file.read()
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            text_content = []
-            for page_num in range(len(doc)):
-                page = doc[page_num]
-                text_content.append(page.get_text())
-            doc.close()
-            return "\n".join(text_content)
-        except Exception as e2:
-            st.error(f"❌ Alternative PDF extraction also failed: {str(e2)}")
-            return ""
 def extract_text_from_docx(docx_file):
     """Extract text from uploaded DOCX file."""
@@ -96,21 +31,20 @@ def extract_text_from_docx(docx_file):
         text_content = []
         for paragraph in doc.paragraphs:
             text_content.append(paragraph.text)
-        full_text = "\n".join(text_content)
-        st.write(f"✅ DOCX text extracted: {len(full_text)} characters")
-        return full_text
     except Exception as e:
-        st.error(f"❌ DOCX extraction error: {str(e)}")
         return ""
 def parse_date(date_str):
     """Parse date from various formats."""
     try:
         if date_str.lower() in ['present', 'current', 'now']:
             return datetime.now()
         date_str = date_str.strip()
         formats = [
             '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
             '%Y/%m', '%Y-%m'
@@ -133,7 +67,8 @@ def parse_date(date_str):
 def calculate_experience(work_history):
     """Calculate total years of experience from work history."""
     total_experience = 0
     for job in work_history:
         duration = job.get('duration', '')
         if not duration:
@@ -153,107 +88,87 @@ def calculate_experience(work_history):
     return round(total_experience, 1)
-def get_api_key():
-    """Get API key from environment or user input."""
-    # Try environment variable (Hugging Face Spaces)
-    api_key = os.getenv("GEMINI_API_KEY")
-    if api_key:
-        return api_key
-    # Try Streamlit secrets (fallback)
-    try:
-        if hasattr(st, 'secrets') and st.secrets and "GEMINI_API_KEY" in st.secrets:
-            return st.secrets["GEMINI_API_KEY"]
-    except:
-        pass
-    return None
-def parse_resume_with_gemini(text_content, api_key):
-    """Use Gemini API to parse resume text."""
     try:
-        # Configure API
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel('gemini-1.5-flash')
-        prompt = """You are a resume parser. Extract the following information from this resume text and return ONLY a valid JSON object:
-{
-    "summary": "Brief 100-word summary of the candidate",
-    "name": "Full name",
-    "email": "Email address",
-    "phone": "Phone number",
-    "education": [
-        {
-            "degree": "Degree name",
-            "institution": "School/University name",
-            "year": "Graduation year",
-            "field": "Field of study",
-            "gpa": "GPA if mentioned"
-        }
-    ],
-    "work_experience": [
-        {
-            "company": "Company name",
-            "position": "Job title",
-            "duration": "Employment period (e.g., Jan 2020 - Present)"
-        }
-    ],
-    "skills": ["List of skills, tools, technologies"],
-    "linkedin": "LinkedIn profile URL"
-}
-Rules:
-- Return ONLY valid JSON, no other text
-- If information is not found, use empty string or empty array
-- For dates, use format like "Jan 2020 - Dec 2022" or "2020 - Present"
-- Include all technical skills, programming languages, tools mentioned
-Resume text:
-"""
-        # Make API call
-        response = model.generate_content(prompt + text_content)
-        if not response or not response.text:
-            st.error("❌ Empty response from Gemini API")
             return None
-        # Clean and parse JSON
-        response_text = response.text.strip()
-        # Find JSON in response
-        json_start = response_text.find('{')
-        json_end = response_text.rfind('}') + 1
-        if json_start == -1 or json_end <= json_start:
-            st.error("❌ No valid JSON found in response")
-            st.code(response_text[:500])
             return None
-        json_str = response_text[json_start:json_end]
-        # Parse JSON
-        result = json.loads(json_str)
-        # Add calculated experience
-        total_exp = calculate_experience(result.get('work_experience', []))
-        result['total_years_experience'] = total_exp
-        return result
-    except json.JSONDecodeError as e:
-        st.error(f"❌ JSON parsing error: {e}")
-        st.code(json_str if 'json_str' in locals() else "No JSON extracted")
-        return None
     except Exception as e:
-        st.error(f"❌ API error: {e}")
-        if "API_KEY" in str(e) or "key" in str(e).lower():
-            st.error("🔑 Please check your Gemini API key")
         return None
 def format_education(edu):
-    """Format education for display."""
     parts = []
     if edu.get('degree'):
         parts.append(edu['degree'])
@@ -265,152 +180,68 @@ def format_education(edu):
         parts.append(f"({edu['year']})")
     if edu.get('gpa') and edu['gpa'].strip():
         parts.append(f"- GPA: {edu['gpa']}")
-    return " ".join(parts) if parts else "No details available"
 def main():
-    st.title("🔍 Resume Parser")
-    st.markdown("Upload a resume (PDF, DOCX, or Image) to extract structured information")
-    # API Key handling
-    api_key = get_api_key()
-    if not api_key:
-        st.warning("⚠️ No API key found in environment variables")
-        api_key = st.text_input(
-            "Enter your Gemini API Key:",
-            type="password",
-            help="Get your API key from https://makersuite.google.com/app/apikey"
-        )
-        if not api_key:
-            st.info("👆 Please enter your Gemini API key to continue")
-            st.stop()
-    else:
-        st.success("✅ API key loaded from environment")
-    # File upload
-    uploaded_file = st.file_uploader(
-        "Choose a resume file",
-        type=["pdf", "docx", "doc", "jpg", "jpeg", "png"],
-        help="Supported formats: PDF, DOCX, DOC, JPG, JPEG, PNG"
-    )
-    if uploaded_file is not None:
-        st.write(f"📁 File uploaded: **{uploaded_file.name}** ({uploaded_file.size} bytes)")
-        # Process file
-        with st.spinner("🔄 Processing resume..."):
-            # Extract text based on file type
-            file_extension = Path(uploaded_file.name).suffix.lower()
-            text_content = ""
-            if file_extension == '.pdf':
-                text_content = extract_text_from_pdf(uploaded_file)
-            elif file_extension in ['.docx', '.doc']:
-                text_content = extract_text_from_docx(uploaded_file)
-            elif file_extension in ['.jpg', '.jpeg', '.png']:
-                st.info("📷 Image processing requires OCR - this may take longer")
-                try:
-                    image = Image.open(uploaded_file)
-                    import pytesseract
-                    text_content = pytesseract.image_to_string(image)
-                    st.write(f"✅ OCR extracted: {len(text_content)} characters")
-                except ImportError:
-                    st.error("❌ OCR not available. Please use PDF or DOCX files.")
-                    return
-                except Exception as e:
-                    st.error(f"❌ OCR error: {e}")
-                    return
-            else:
-                st.error(f"❌ Unsupported file type: {file_extension}")
-                return
-            # Check if text was extracted
-            if not text_content or len(text_content.strip()) < 20:
-                st.error("❌ Could not extract meaningful text from the file")
-                st.info("💡 Try using a different file format or ensure the file contains readable text")
-                if text_content:
-                    st.write("Extracted text preview:")
-                    st.code(text_content[:200])
-                return
-            # Show text preview
-            with st.expander("📄 View extracted text (first 300 characters)"):
-                st.code(text_content[:300] + "..." if len(text_content) > 300 else text_content)
-            # Parse with Gemini
-            st.write("🤖 Analyzing with AI...")
-            result = parse_resume_with_gemini(text_content, api_key)
             if result:
-                st.success("✅ Resume parsed successfully!")
-                # Display results
-                st.header("📊 Extracted Information")
-                # Summary
-                if result.get('summary'):
-                    st.subheader("📝 Summary")
-                    st.write(result['summary'])
-                # Personal info
-                st.subheader("👤 Personal Information")
                 col1, col2, col3 = st.columns(3)
                 with col1:
-                    st.metric("Name", result.get('name', 'Not found'))
                 with col2:
-                    st.metric("Email", result.get('email', 'Not found'))
                 with col3:
-                    st.metric("Phone", result.get('phone', 'Not found'))
-                # Experience
                 total_exp = result.get('total_years_experience', 0)
-                if total_exp > 0:
-                    exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
-                    st.metric("Total Experience", exp_text)
-                # Education
-                st.subheader("🎓 Education")
                 if result.get('education'):
                     for edu in result['education']:
-                        st.write(f"• {format_education(edu)}")
                 else:
                     st.write("No education information found")
-                # Work Experience
-                st.subheader("💼 Work Experience")
                 if result.get('work_experience'):
                     for exp in result['work_experience']:
                         duration = f" ({exp.get('duration', 'Duration not specified')})" if exp.get('duration') else ""
-                        st.write(f"• **{exp.get('position', 'Position not found')}** at {exp.get('company', 'Company not found')}{duration}")
                 else:
                     st.write("No work experience found")
-                # Skills
-                st.subheader("🛠️ Skills")
                 if result.get('skills'):
-                    # Display skills as tags
-                    skills_text = " • ".join(result['skills'])
-                    st.write(skills_text)
                 else:
-                    st.write("No skills found")
-                # LinkedIn
-                if result.get('linkedin'):
-                    st.subheader("🔗 LinkedIn Profile")
-                    st.write(result['linkedin'])
-                # Download results
-                st.subheader("💾 Download Results")
-                json_str = json.dumps(result, indent=2)
-                st.download_button(
-                    label="Download JSON",
-                    data=json_str,
-                    file_name=f"resume_parsed_{uploaded_file.name}.json",
-                    mime="application/json"
-                )
-            else:
-                st.error("❌ Failed to parse resume. Please check the file and try again.")
 if __name__ == "__main__":
     main()

 from pathlib import Path
 from datetime import datetime
 import re
+import pytesseract
 import io
 def extract_text_from_pdf(pdf_file):
     """Extract the plain text of every page in an uploaded PDF.

     Args:
         pdf_file: Binary file-like object holding the PDF data; it is read
             from its current position.

     Returns:
         The text of all pages joined with newlines, or "" when the data
         cannot be opened or read (the error is shown via ``st.error``).
     """
     try:
         pdf_bytes = pdf_file.read()
         # Open from memory and let the context manager close the document,
         # so the handle is released even if a page fails to extract.
         with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
             return "\n".join(page.get_text() for page in doc)
     except Exception as e:
         st.error(f"Error in PDF extraction: {str(e)}")
         return ""
 def extract_text_from_docx(docx_file):
     """Extract text from uploaded DOCX file."""
         text_content = []
         for paragraph in doc.paragraphs:
             text_content.append(paragraph.text)
+        return "\n".join(text_content)
     except Exception as e:
+        st.error(f"Error in DOCX extraction: {str(e)}")
         return ""
 def parse_date(date_str):
     """Parse date from various formats."""
     try:
+        # Handle 'Present' or 'Current'
         if date_str.lower() in ['present', 'current', 'now']:
             return datetime.now()
         date_str = date_str.strip()
         formats = [
             '%Y', '%b %Y', '%B %Y', '%m/%Y', '%m-%Y',
             '%Y/%m', '%Y-%m'
 def calculate_experience(work_history):
     """Calculate total years of experience from work history."""
     total_experience = 0
+    current_year = datetime.now().year
     for job in work_history:
         duration = job.get('duration', '')
         if not duration:
     return round(total_experience, 1)
+def parse_resume(file_uploaded, api_key):
+    """Parse resume and extract information."""
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel('gemini-1.5-flash')
+    prompt = """Extract the following information from this resume:
+    1. Summarize the following resume in 100 words, focusing on key skills, experience, and qualifications
+    2. Full Name
+    3. Email Address
+    4. Phone Number
+    5. Education History (including degree, institution, graduation year, and field of study)
+    6. Companies worked at with positions and EXACT duration (e.g., "Jan 2020 - Present" or "2018-2020")
+    7. Skills
+    8. LinkedIn Profile URL
+    Return the information in this JSON format:
+    {
+        "summary": "",
+        "name": "",
+        "email": "",
+        "phone": "",
+        "education": [
+            {
+                "degree": "",
+                "institution": "",
+                "year": "",
+                "field": "",
+                "gpa": ""
+            }
+        ],
+        "work_experience": [
+            {
+                "company": "",
+                "position": "",
+                "duration": ""
+            }
+        ],
+        "skills": [],
+        "linkedin": ""
+    }
+    For skills include tools and technologies in output if present any in resume.
+    For work experience durations, please specify exact dates in format: "MMM YYYY - MMM YYYY" or "YYYY - Present" , please return in one order either in ascending or descending.
+    Only return the JSON object, nothing else. If any field is not found, leave it empty."""
     try:
+        file_extension = Path(file_uploaded.name).suffix.lower()
+        if file_extension == '.pdf':
+            text_content = extract_text_from_pdf(file_uploaded)
+        elif file_extension in ['.docx', '.doc']:
+            text_content = extract_text_from_docx(file_uploaded)
+        elif file_extension in ['.jpg', '.jpeg', '.png']:
+            image = Image.open(file_uploaded)
+            text_content = pytesseract.image_to_string(image)
+        else:
+            st.error(f"Unsupported file format: {file_extension}")
             return None
+        response = model.generate_content(f"{prompt}\n\nResume Text:\n{text_content}")
+        try:
+            response_text = response.text
+            json_start = response_text.find('{')
+            json_end = response_text.rfind('}') + 1
+            json_str = response_text[json_start:json_end]
+            result = json.loads(json_str)
+            total_exp = calculate_experience(result.get('work_experience', []))
+            result['total_years_experience'] = total_exp
+            return result
+        except json.JSONDecodeError as e:
+            st.error(f"Error parsing response: {str(e)}")
             return None
     except Exception as e:
+        st.error(f"Error processing resume: {str(e)}")
         return None
 def format_education(edu):
+    """Format education details for display."""
     parts = []
     if edu.get('degree'):
         parts.append(edu['degree'])
         parts.append(f"({edu['year']})")
     if edu.get('gpa') and edu['gpa'].strip():
         parts.append(f"- GPA: {edu['gpa']}")
+    return " ".join(parts)
 def _get_api_key():
     """Return the Gemini API key from Streamlit secrets or the environment, else None."""
     import os  # local import: only this lookup needs it

     try:
         if "GEMINI_API_KEY" in st.secrets and st.secrets["GEMINI_API_KEY"]:
             return st.secrets["GEMINI_API_KEY"]
     except Exception:
         # Accessing st.secrets can raise when no secrets file is configured;
         # that only means "no key here", so fall through to the environment.
         pass
     return os.getenv("GEMINI_API_KEY") or None


 def _shown(value, fallback='Not found'):
     """Return *value* for display, or *fallback* when it is empty or missing."""
     return value if value else fallback


 def main():
     """Render the page: collect the API key and a resume, then show the parsed fields.

     The API key is taken from Streamlit secrets, then the ``GEMINI_API_KEY``
     environment variable, and only then requested from the user. Nothing is
     parsed until both a key and an uploaded file are available.
     """
     st.title("Resume Parser")
     st.write("Upload a resume (PDF, DOCX, or Image) to extract information")

     # Get API key from secrets/environment or user input
     api_key = _get_api_key()
     if not api_key:
         api_key = st.text_input("Enter Gemini API Key", type="password")

     uploaded_file = st.file_uploader("Choose a resume file", type=["pdf", "docx", "doc", "jpg", "jpeg", "png"])
     if not (uploaded_file and api_key):
         return

     with st.spinner('Analyzing resume...'):
         result = parse_resume(uploaded_file, api_key)
         if not result:
             # parse_resume has already reported the failure.
             return

         st.subheader("Extracted Information")

         # Display summary in a text area
         st.text_area("Summary", _shown(result.get('summary')), height=100)

         # Display personal information. The model is told to leave missing
         # fields empty, so empty strings are shown as 'Not found' too.
         col1, col2, col3 = st.columns(3)
         with col1:
             st.write("**Name:**", _shown(result.get('name')))
         with col2:
             st.write("**Email:**", _shown(result.get('email')))
         with col3:
             st.write("**Phone:**", _shown(result.get('phone')))

         # Display total experience (months when under one year)
         total_exp = result.get('total_years_experience', 0) or 0
         exp_text = f"{total_exp:.1f} years" if total_exp >= 1 else f"{total_exp * 12:.0f} months"
         st.write("**Total Experience:**", exp_text)

         # Display education
         st.subheader("Education")
         if result.get('education'):
             for edu in result['education']:
                 st.write(f"- {format_education(edu)}")
         else:
             st.write("No education information found")

         # Display work experience
         st.subheader("Work Experience")
         if result.get('work_experience'):
             for exp in result['work_experience']:
                 duration = f" ({exp['duration']})" if exp.get('duration') else ""
                 position = _shown(exp.get('position'), 'Role not found')
                 company = _shown(exp.get('company'), 'Company not found')
                 st.write(f"- {position} at {company}{duration}")
         else:
             st.write("No work experience found")

         # Display Skills
         st.subheader("Skills:")
         if result.get('skills'):
             for skill in result['skills']:
                 st.write(f"- {skill}")
         else:
             st.write("- No skills found")

         # Display LinkedIn profile
         st.write("**LinkedIn Profile:**", _shown(result.get('linkedin')))
 # Entry point: build the page when this module is run as a script.
 if __name__ == "__main__":
     main()