# aqsa-123's picture
# Update app.py
# c1e0635 verified
# IMPORTS
import io
import re
from PIL import Image
import gradio as gr
import pdfplumber
from docx import Document
# Hugging Face OCR model imports
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
# ---------------- HF OCR SETUP ----------------
# Load the TrOCR processor and the encoder-decoder weights once, when the
# module is first executed; both are fetched from the same checkpoint id.
# NOTE(review): the "-stage1" checkpoints appear to be the pre-trained-only
# TrOCR weights (see the model card) -- confirm this is the intended variant
# rather than a fine-tuned one such as "trocr-base-printed".
_TROCR_CHECKPOINT = "microsoft/trocr-base-stage1"
processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT)
def extract_text_from_image(file_bytes):
    """Run the module-level TrOCR model on an encoded image and return its text.

    Args:
        file_bytes: Raw bytes of an image file that PIL can open.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens skipped.
    """
    # The image is normalised to RGB before it is handed to the processor.
    rgb_image = Image.open(io.BytesIO(file_bytes)).convert("RGB")
    encoded = processor(images=rgb_image, return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    # A single image was encoded, so only the first decoded entry is used.
    return decoded[0]
# ---------------- PDF TEXT ----------------
def extract_text_from_pdf(file_bytes):
    """Extract the text layer of every page of a PDF.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all pages that yielded any, joined with newlines; an
        empty string when no page produced text.
    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        extracted = (page.extract_text() for page in pdf.pages)
        # Skip pages whose extraction result is falsy (no text found).
        page_texts = [page_text for page_text in extracted if page_text]
    # Separate pages with a newline so the last word of one page is not
    # fused with the first word of the next, which would hide keywords
    # from the validation regexes.
    return "\n".join(page_texts)
# ---------------- DOCX TEXT ----------------
def extract_text_from_docx(file_bytes):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        file_bytes: Raw bytes of the .docx file.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``
        collection, joined with newlines.
    """
    # NOTE(review): only ``paragraphs`` is read here; text placed inside
    # tables is presumably not included -- confirm against python-docx.
    document = Document(io.BytesIO(file_bytes))
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
# ---------------- FILE HANDLER ----------------
def extract_text_from_file(file_bytes, file_ext):
    """Dispatch to the extractor that matches a file extension.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        file_ext: Extension without the leading dot; compared as-is, so the
            caller is expected to pass it in lower case.

    Returns:
        The extracted text, or an empty string for an unrecognised
        extension.
    """
    image_extensions = ("jpg", "jpeg", "png")

    # The extension tests are mutually exclusive, so each one is a guard
    # clause that returns as soon as it matches.
    if file_ext in image_extensions:
        return extract_text_from_image(file_bytes)
    if file_ext == "docx":
        return extract_text_from_docx(file_bytes)
    if file_ext == "pdf":
        return extract_text_from_pdf(file_bytes)
    return ""
# ---------------- RESUME VALIDATION ----------------
# Patterns are compiled once at import time and run against lower-cased text.
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
_PHONE_RE = re.compile(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}|\d{10,11}')
_EMAIL_KEYWORDS = ('@gmail', '@yahoo', '@hotmail', '.com', 'email', 'e-mail')
# Degree abbreviations are anchored at a word start so that "items" or
# "jobs" no longer count as "ms" / "bs"; an optional dot accepts "b.s.",
# "m.s." and "ph.d.". Prefix-only anchoring keeps "bsc", "msc", "bscs".
_EDUCATION_RE = re.compile(
    r'education|degree|bachelor|master|university|school|college'
    r'|\b(?:b\.?s|m\.?s|ph\.?d)'
)
_EXPERIENCE_RE = re.compile(
    r'(experience|worked|roles?|employment|projects?|internship|career|manager|designer|assistant|executive|specialist|developer|engineer|analyst|officer|coordinator)'
)
# "excel", "git" and "aws" are anchored so that "excellent", "digital" and
# "laws" are not mistaken for skills; "github" / "gitlab" still match.
_SKILLS_RE = re.compile(
    r'marketing|communication|skills|technologies|tools|competencies'
    r'|python|\bexcel\b|sql|java|c\+\+|javascript|html|css|react|node'
    r'|\bgit|linux|\baws\b|docker|kubernetes|leadership|teamwork'
)


def validate_resume(text):
    """Check resume text for the four sections a complete resume should have.

    The check is a keyword heuristic applied to the lower-cased text.

    Args:
        text: Plain text extracted from the resume.

    Returns:
        A list of issue strings, in the order contact info, education,
        experience, skills, for every section that was not detected; or a
        single-element list with the success message when nothing is
        missing.
    """
    text_lower = text.lower()
    issues = []

    # Contact info: a real e-mail address, a phone number, or merely an
    # e-mail-related keyword is accepted (deliberately lenient).
    has_contact = (
        _EMAIL_RE.search(text_lower) is not None
        or _PHONE_RE.search(text_lower) is not None
        or any(keyword in text_lower for keyword in _EMAIL_KEYWORDS)
    )
    if not has_contact:
        issues.append("Missing Contact Info")

    if not _EDUCATION_RE.search(text_lower):
        issues.append("Missing Education")

    if not _EXPERIENCE_RE.search(text_lower):
        issues.append("Missing Experience")

    if not _SKILLS_RE.search(text_lower):
        issues.append("Missing Skills")

    return issues if issues else ["✅ Resume is Complete!"]
# ---------------- MAIN FUNCTION ----------------
def check_resume(file):
    """Gradio callback: read an uploaded resume and report missing sections.

    Args:
        file: The upload, either a filesystem path string or an object
            whose ``name`` attribute holds the path; ``None`` when nothing
            was uploaded.

    Returns:
        A ``(result, preview)`` tuple of strings. ``result`` is the
        validation outcome (one issue per line) or a warning message;
        ``preview`` is the first 1000 characters of the extracted text, or
        an empty string whenever a warning is returned.
    """
    if file is None:
        return "⚠️ Please upload a file", ""
    try:
        # Accept a plain path string as well as an upload object that
        # exposes its path via ``.name``.
        path = file if isinstance(file, str) else file.name
        file_ext = path.split(".")[-1].lower()
        with open(path, "rb") as f:
            file_bytes = f.read()
        text = extract_text_from_file(file_bytes, file_ext)
        if not text.strip():
            return "⚠️ Could not extract text", ""
        result = "\n".join(validate_resume(text))
        return result, text[:1000]  # show first 1000 chars
    except Exception as e:
        # UI boundary: report any failure to the user instead of raising.
        return f"⚠️ Error: {e}", ""
# ---------------- GRADIO INTERFACE ----------------
# One file upload goes in; the validation result and a preview of the
# extracted text come out, matching the two values check_resume returns.
_resume_upload = gr.File(label="Upload Resume (PDF, DOCX, JPG, PNG)")
_result_box = gr.Textbox(label="Result", lines=6)
_extracted_box = gr.Textbox(label="Extracted Text", lines=6)

demo = gr.Interface(
    fn=check_resume,
    inputs=_resume_upload,
    outputs=[_result_box, _extracted_box],
    title="📄 Resume Completeness Checker",
    description="Upload clear resume files for better results.",
)

# The app is launched as soon as this module is executed.
demo.launch(share=True)