mkoot007's picture
Update app.py
80de488
raw
history blame
1.92 kB
import gradio as gr
import re
from docx import Document
from PyPDF2 import PdfReader # Use PdfReader from PyPDF2
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated in page order.

    NOTE: another function with this same name is defined further down in
    this file; being defined later, that one is what callers actually get.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() for page in reader.pages)
# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file):
    """Return the paragraph texts of *docx_file* joined by newline characters."""
    paragraphs = Document(docx_file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def extract_text_from_pdf(pdf_file):
    """Read *pdf_file* and return the text of all its pages joined together.

    Raises:
        ValueError: If the document contains no pages.
    """
    reader = PdfReader(pdf_file)
    page_list = reader.pages
    # Guard clause: refuse documents without any page before extracting.
    if not page_list:
        raise ValueError("The PDF file is empty.")
    chunks = [page.extract_text() for page in page_list]
    return "".join(chunks)
# Function to extract information from a resume
def extract_info_from_resume(resume_path):
    """Extract a name, an email address and a phone number from a resume.

    Args:
        resume_path: Either a path string or an uploaded-file object that
            exposes its path as ``.name``. The value is forwarded unchanged
            to the PDF/DOCX text extractor.

    Returns:
        A dict with the keys ``"Name"``, ``"Email"`` and ``"Phone"``. Each
        value is the first regex match found in the resume text, or a
        ``"... not found"`` placeholder when nothing matched.

    Raises:
        ValueError: If the file name does not end in ``.pdf`` or ``.docx``
            (compared case-insensitively).
    """
    # Uploaded-file wrappers carry their path in ``.name``; a plain path is
    # used as-is. Lower-casing lets "CV.PDF" / "CV.Docx" be recognised too.
    filename = str(getattr(resume_path, "name", resume_path)).lower()

    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(resume_path)
    elif filename.endswith(".docx"):
        text = extract_text_from_docx(resume_path)
    else:
        raise ValueError("Unsupported file format. Only PDF and DOCX are supported.")

    # field -> (regex, placeholder used when the regex finds nothing)
    patterns = {
        # Two or more consecutive capitalised words, e.g. "Jane Doe".
        "Name": (r"[A-Z][a-z]+(?: [A-Z][a-z]+)+", "Name not found"),
        # The domain must be dot-separated labels, so a sentence-ending "."
        # or "," after the address is not swallowed; "+" is allowed in the
        # local part so tagged addresses are not truncated.
        "Email": (r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", "Email not found"),
        # 3-3-4 digit numbers with optional parentheses and -, . or space
        # separators, e.g. "(555) 123-4567".
        "Phone": (
            r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}",
            "Phone number not found",
        ),
    }

    extracted_info = {}
    for field, (pattern, fallback) in patterns.items():
        match = re.search(pattern, text)
        extracted_info[field] = match.group() if match else fallback
    return extracted_info
# Define a Gradio interface
iface = gr.Interface(
    fn=extract_info_from_resume,
    # Use the top-level File component: the `gr.inputs` namespace is a
    # deprecated alias that newer Gradio releases no longer provide. The
    # component's default `type` is kept so the installed Gradio version
    # decides how the upload is handed to `fn`.
    inputs=gr.File(),
    outputs="json",
)
# Deploy the Gradio interface
iface.launch(share=True)