# resume-parser-2 / app.py
# Hugging Face Space file header: uploaded by sejalkishan ("Update app.py",
# commit 5325f14, verified).
import gradio as gr
import pdfplumber
import docx
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import pytesseract
import torch
import os
import spaces
import re
# Authenticate Hugging Face token
# NOTE(review): the secret is read from an env var literally named "token" —
# confirm that matches the Space's secret name (HF convention is "HF_TOKEN").
login(token=os.environ.get("token"))
# Ensure GPU is available — fail fast at startup rather than at first inference.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")
# Model checkpoint used for resume parsing (gated repo; requires the login above).
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Document extractors
def extract_text_from_pdf(file):
    """Return all text from a PDF, using OCR as a fallback for image-only pages."""
    collected = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
            else:
                # No text layer on this page: rasterize at 300 DPI and OCR it.
                image = page.to_image(resolution=300).original
                collected.append(pytesseract.image_to_string(image) + "\n")
    return "".join(collected)
def extract_text_from_docx(file):
    """Return the non-empty paragraphs of a DOCX file joined by newlines."""
    document = docx.Document(file)
    non_empty = (p.text for p in document.paragraphs if p.text.strip() != "")
    return "\n".join(non_empty)
def chunk_text(text, max_chars=6000):
    """Split *text* into newline-terminated chunks of roughly *max_chars* each.

    Paragraphs (newline-separated) are accumulated into a buffer; when the
    next paragraph would push the buffer to *max_chars* or beyond, the buffer
    is flushed and a new one is started.
    """
    chunks = []
    buf = ""
    for piece in text.split("\n"):
        if len(buf) + len(piece) >= max_chars:
            # Flush the current buffer before starting a new one.
            chunks.append(buf)
            buf = ""
        buf += piece + "\n"
    if buf:
        chunks.append(buf)
    return chunks
# Prompt for resume parsing
def create_prompt(text_chunk):
return f"""
Extract the following key details from this resume in SHORT key-point format (no long sentences). Return only clean bullet points:
- Name
- Email
- Phone
- Skills (just key skill names or topics)
- Education (just degree, institution, year, no full sentences)
- Experience (just role, company, time period)
- Projects (project title and tech/tools used)
- Certifications (only titles)
CONTENT:
{text_chunk}
Only return bullet points under each section.
"""
# Cleaner to extract only final part
def extract_final_response(raw_output):
    """Trim any echoed prompt, keeping text from the first 'name:' marker onward.

    The match is case-insensitive; when no marker is found the whole output is
    returned (stripped) unchanged.
    """
    marker = raw_output.lower().find("name:")
    return raw_output.strip() if marker == -1 else raw_output[marker:].strip()
# Convert final text summary to structured JSON
def convert_output_to_json(summary_text):
    """Parse the model's bullet-point summary into a structured dict.

    Recognizes scalar fields ("Name:", "Email:", "Phone:") and list sections
    (Skills / Education / Experience / Projects / Certifications).  Bullet
    lines under the current section are collected into that section's list.

    Fixes over the previous version:
    - Inline content after a section header ("Skills: Python") is kept as an
      item instead of being discarded.
    - "* " and "• " bullets are accepted in addition to "- " (LLM output
      formatting varies).
    """
    result = {
        "name": "",
        "email": "",
        "phone": "",
        "skills": [],
        "education": [],
        "experience": [],
        "projects": [],
        "certifications": [],
    }
    section_patterns = {
        "skills": re.compile(r"^skills\b[:\s]*", re.IGNORECASE),
        "education": re.compile(r"^education\b[:\s]*", re.IGNORECASE),
        "experience": re.compile(r"^experience\b[:\s]*", re.IGNORECASE),
        "projects": re.compile(r"^projects\b[:\s]*", re.IGNORECASE),
        "certifications": re.compile(r"^certifications\b[:\s]*", re.IGNORECASE),
    }
    scalar_fields = ("name", "email", "phone")
    bullet_prefixes = ("- ", "* ", "• ")  # all two characters long
    current_section = None
    for line in summary_text.splitlines():
        line = line.strip()
        if not line:
            continue
        lowered = line.lower()
        # Scalar fields: "Name: ...", "Email: ...", "Phone: ...".
        matched_scalar = False
        for field in scalar_fields:
            prefix = field + ":"
            if lowered.startswith(prefix):
                result[field] = line[len(prefix):].strip()
                matched_scalar = True
                break
        if matched_scalar:
            continue
        # Section headers: switch the current section; keep any inline content.
        matched_section = False
        for section, pattern in section_patterns.items():
            m = pattern.match(line)
            if m:
                current_section = section
                remainder = line[m.end():].strip()
                if remainder:
                    result[section].append(remainder)
                matched_section = True
                break
        if matched_section:
            continue
        # Bullet items belong to the most recently seen section.
        if current_section and line.startswith(bullet_prefixes):
            result[current_section].append(line[2:].strip())
    return result
# Main inference function
# Lazily-created generation pipeline, cached so the 8B model is downloaded and
# loaded only once per process instead of on every request (the previous
# version rebuilt tokenizer, model, and pipeline inside every call).
_GENERATOR = None

def _get_generator():
    """Load (once) and return the cached text-generation pipeline."""
    global _GENERATOR
    if _GENERATOR is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            token=os.environ.get("token"),
            trust_remote_code=True,
        )
        _GENERATOR = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return _GENERATOR

@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    """Extract structured resume info from an uploaded PDF/DOCX file.

    Returns a 3-tuple of (summary_text, status_message, json_dict) matching
    the Gradio outputs [output_box, status_box, json_output].

    NOTE(review): cancel_flag is the gr.State *value captured when this call
    starts*; a Terminate click during generation does not change it mid-run,
    so the per-chunk check below only honors a flag set before this call —
    confirm whether true mid-run cancellation is required.
    """
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format", {}
    if not raw_text.strip():
        return "❌ No text found in the document.", "❌ Empty document", {}
    generator = _get_generator()
    full_summary = ""
    for chunk in chunk_text(raw_text):
        if cancel_flag:
            return "β›” Analysis cancelled by user.", "β›” Terminated by user", {}
        prompt = create_prompt(chunk)
        # Greedy decoding for deterministic extraction output.
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        full_summary += extract_final_response(result) + "\n\n---\n\n"
    final_text = full_summary.strip()
    return final_text, "βœ… Completed", convert_output_to_json(final_text)
# Gradio Interface
# Gradio interface: file upload on the left, extracted text + JSON on the right.
with gr.Blocks(title="Smart Resume Parser - AI Powered") as demo:
    gr.Markdown("## πŸ“„ Resume Parser – Extract Key Info using LLaMA 3 8B")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="πŸ“Ž Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("πŸ” Analyze", variant="primary")
                terminate_button = gr.Button("❌ Terminate", variant="stop")
            status_box = gr.Textbox(label="πŸ“Š Status", value="⏳ Waiting for input...", interactive=False)
        with gr.Column(scale=2):
            output_box = gr.Textbox(label="🧠 Extracted Resume Info", lines=30, interactive=False)
            json_output = gr.JSON(label="🧾 Resume JSON Output")
    # Per-session cancellation flag, read by analyze_document once per call.
    cancel_flag = gr.State(False)
    analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box, json_output],
    )
    # FIX: a gr.State output receives the returned value directly; the old
    # fn=lambda: gr.update(value=True) stored the update payload (a dict)
    # as the state value instead of the boolean True.
    terminate_button.click(
        fn=lambda: True,
        inputs=[],
        outputs=[cancel_flag],
    )
demo.launch(server_name="0.0.0.0", server_port=7860)