Spaces:

Alpha108
/

MatchHive-ai

Sleeping

App Files Files Community

Alpha108 committed on Sep 27, 2025

Commit

be56e96

verified ·

1 Parent(s): 4dbd292

Create resume_parser.py

Browse files

Files changed (1) hide show

backend/agents/resume_parser.py +71 -0

backend/agents/resume_parser.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import PyPDF2
+import docx
+import io
def parse_pdf(file_stream):
    """
    Extracts text from a PDF file stream.
    Args:
        file_stream: A file-like object (e.g., from st.file_uploader).
    Returns:
        str: The extracted text of all pages, joined with newlines. A page
            for which extraction yields nothing contributes an empty string.
    Raises:
        ValueError: If reading the PDF or extracting its text fails. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        reader = PyPDF2.PdfReader(file_stream)
        # extract_text() may give a falsy result; normalise it to "" so the
        # join below always sees strings.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        print(f"Error reading PDF: {e}")
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError("Could not parse the PDF file. It might be corrupted or image-based.") from e
    # NOTE(review): a PDF with no text layer presumably yields an empty string
    # here rather than an error, despite the message above - confirm.
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next.
    return "\n".join(page_texts)
def parse_docx(file_stream):
    """
    Extracts text from a DOCX file stream.
    Args:
        file_stream: A file-like object.
    Returns:
        str: The text of every paragraph in the document, each followed by
            a newline.
    Raises:
        ValueError: If the document cannot be opened or read. The underlying
            exception is chained as ``__cause__``.
    """
    try:
        doc = docx.Document(file_stream)
        # NOTE(review): only doc.paragraphs is read; presumably text that
        # lives inside tables is not included - confirm against python-docx.
        lines = [f"{para.text}\n" for para in doc.paragraphs]
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        # Chain the original error so the real cause stays in the traceback.
        raise ValueError("Could not parse the DOCX file.") from e
    return "".join(lines)
def parse_resume(uploaded_file):
    """
    Parses an uploaded resume file (PDF or DOCX) and returns its text content.
    Args:
        uploaded_file: The file object from Streamlit's file_uploader. It
            must expose a ``name`` attribute and a ``getvalue()`` method.
    Returns:
        str: The text content of the resume.
    Raises:
        ValueError: If no file was given, the file type is not supported
            (including names with no extension), or parsing fails.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded.")
    # rpartition tells "no dot at all" apart from a real extension, so a file
    # literally named "pdf" or "docx" is not mistaken for a supported type.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    file_extension = suffix.lower() if dot else ""
    if file_extension == 'pdf':
        parser = parse_pdf
    elif file_extension == 'docx':
        parser = parse_docx
    else:
        raise ValueError(f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file.")
    # We use BytesIO to handle the file in memory; the copy is made only once
    # the type is known to be supported.
    file_stream = io.BytesIO(uploaded_file.getvalue())
    return parser(file_stream)