# resume-parser-2 / app.py
# Hugging Face Space file header: uploaded by sejalkishan ("Update app.py",
# commit 5325f14, verified).
import gradio as gr
import pdfplumber
import docx
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import pytesseract
import torch
import os
import spaces
import re
# Authenticate Hugging Face token
# NOTE(review): the secret is read from an env var literally named "token" —
# confirm that matches the Space's secret name (HF convention is "HF_TOKEN").
login(token=os.environ.get("token"))
# Ensure GPU is available — fail fast at startup rather than at first inference.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")
# Model checkpoint used for resume parsing (gated repo; requires the login above).
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Document extractors
def extract_text_from_pdf(file):
    """Return all text from a PDF, using OCR as a fallback for image-only pages."""
    collected = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
            else:
                # No text layer on this page: rasterize at 300 DPI and OCR it.
                image = page.to_image(resolution=300).original
                collected.append(pytesseract.image_to_string(image) + "\n")
    return "".join(collected)
def extract_text_from_docx(file):
    """Return the non-empty paragraphs of a DOCX file joined by newlines."""
    document = docx.Document(file)
    non_empty = (p.text for p in document.paragraphs if p.text.strip() != "")
    return "\n".join(non_empty)
def chunk_text(text, max_chars=6000):
    """Split *text* into newline-terminated chunks of roughly *max_chars* each.

    Paragraphs (newline-separated) are accumulated into a buffer; when the
    next paragraph would push the buffer to *max_chars* or beyond, the buffer
    is flushed and a new one is started.
    """
    chunks = []
    buf = ""
    for piece in text.split("\n"):
        if len(buf) + len(piece) >= max_chars:
            # Flush the current buffer before starting a new one.
            chunks.append(buf)
            buf = ""
        buf += piece + "\n"
    if buf:
        chunks.append(buf)
    return chunks
# Prompt for resume parsing
def create_prompt(text_chunk):
return f"""
Extract the following key details from this resume in SHORT key-point format (no long sentences). Return only clean bullet points:
- Name
- Email
- Phone
- Skills (just key skill names or topics)
- Education (just degree, institution, year, no full sentences)
- Experience (just role, company, time period)
- Projects (project title and tech/tools used)
- Certifications (only titles)
CONTENT:
{text_chunk}
Only return bullet points under each section.
"""
# Cleaner to extract only final part
def extract_final_response(raw_output):
    """Trim any echoed prompt, keeping text from the first 'name:' marker onward.

    The match is case-insensitive; when no marker is found the whole output is
    returned (stripped) unchanged.
    """
    marker = raw_output.lower().find("name:")
    return raw_output.strip() if marker == -1 else raw_output[marker:].strip()
# Convert final text summary to structured JSON
def convert_output_to_json(summary_text):
    """Parse the model's bullet-point summary into a structured dict.

    Recognizes scalar fields ("Name:", "Email:", "Phone:") and list sections
    (Skills / Education / Experience / Projects / Certifications).  Bullet
    lines under the current section are collected into that section's list.

    Fixes over the previous version:
    - Inline content after a section header ("Skills: Python") is kept as an
      item instead of being discarded.
    - "* " and "• " bullets are accepted in addition to "- " (LLM output
      formatting varies).
    """
    result = {
        "name": "",
        "email": "",
        "phone": "",
        "skills": [],
        "education": [],
        "experience": [],
        "projects": [],
        "certifications": [],
    }
    section_patterns = {
        "skills": re.compile(r"^skills\b[:\s]*", re.IGNORECASE),
        "education": re.compile(r"^education\b[:\s]*", re.IGNORECASE),
        "experience": re.compile(r"^experience\b[:\s]*", re.IGNORECASE),
        "projects": re.compile(r"^projects\b[:\s]*", re.IGNORECASE),
        "certifications": re.compile(r"^certifications\b[:\s]*", re.IGNORECASE),
    }
    scalar_fields = ("name", "email", "phone")
    bullet_prefixes = ("- ", "* ", "• ")  # all two characters long
    current_section = None
    for line in summary_text.splitlines():
        line = line.strip()
        if not line:
            continue
        lowered = line.lower()
        # Scalar fields: "Name: ...", "Email: ...", "Phone: ...".
        matched_scalar = False
        for field in scalar_fields:
            prefix = field + ":"
            if lowered.startswith(prefix):
                result[field] = line[len(prefix):].strip()
                matched_scalar = True
                break
        if matched_scalar:
            continue
        # Section headers: switch the current section; keep any inline content.
        matched_section = False
        for section, pattern in section_patterns.items():
            m = pattern.match(line)
            if m:
                current_section = section
                remainder = line[m.end():].strip()
                if remainder:
                    result[section].append(remainder)
                matched_section = True
                break
        if matched_section:
            continue
        # Bullet items belong to the most recently seen section.
        if current_section and line.startswith(bullet_prefixes):
            result[current_section].append(line[2:].strip())
    return result
# Main inference function
# Lazily-created generation pipeline, cached so the 8B model is downloaded and
# loaded only once per process instead of on every request (the previous
# version rebuilt tokenizer, model, and pipeline inside every call).
_GENERATOR = None

def _get_generator():
    """Load (once) and return the cached text-generation pipeline."""
    global _GENERATOR
    if _GENERATOR is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            token=os.environ.get("token"),
            trust_remote_code=True,
        )
        _GENERATOR = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return _GENERATOR

@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    """Extract structured resume info from an uploaded PDF/DOCX file.

    Returns a 3-tuple of (summary_text, status_message, json_dict) matching
    the Gradio outputs [output_box, status_box, json_output].

    NOTE(review): cancel_flag is the gr.State *value captured when this call
    starts*; a Terminate click during generation does not change it mid-run,
    so the per-chunk check below only honors a flag set before this call —
    confirm whether true mid-run cancellation is required.
    """
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format", {}
    if not raw_text.strip():
        return "❌ No text found in the document.", "❌ Empty document", {}
    generator = _get_generator()
    full_summary = ""
    for chunk in chunk_text(raw_text):
        if cancel_flag:
            return "β›” Analysis cancelled by user.", "β›” Terminated by user", {}
        prompt = create_prompt(chunk)
        # Greedy decoding for deterministic extraction output.
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        full_summary += extract_final_response(result) + "\n\n---\n\n"
    final_text = full_summary.strip()
    return final_text, "βœ… Completed", convert_output_to_json(final_text)
# Gradio Interface
# Gradio interface: file upload on the left, extracted text + JSON on the right.
with gr.Blocks(title="Smart Resume Parser - AI Powered") as demo:
    gr.Markdown("## πŸ“„ Resume Parser – Extract Key Info using LLaMA 3 8B")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="πŸ“Ž Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("πŸ” Analyze", variant="primary")
                terminate_button = gr.Button("❌ Terminate", variant="stop")
            status_box = gr.Textbox(label="πŸ“Š Status", value="⏳ Waiting for input...", interactive=False)
        with gr.Column(scale=2):
            output_box = gr.Textbox(label="🧠 Extracted Resume Info", lines=30, interactive=False)
            json_output = gr.JSON(label="🧾 Resume JSON Output")
    # Per-session cancellation flag, read by analyze_document once per call.
    cancel_flag = gr.State(False)
    analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box, json_output],
    )
    # FIX: a gr.State output receives the returned value directly; the old
    # fn=lambda: gr.update(value=True) stored the update payload (a dict)
    # as the state value instead of the boolean True.
    terminate_button.click(
        fn=lambda: True,
        inputs=[],
        outputs=[cancel_flag],
    )
demo.launch(server_name="0.0.0.0", server_port=7860)