# Hugging Face Space: sejalkishan/Resume-parser
# Space status: Runtime error
# File size: 4,624 Bytes

import gradio as gr
import pdfplumber
import docx
import os
import pytesseract
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from huggingface_hub import login
import spaces

# 🔐 Login: the Hugging Face token is read from the HF_TOKEN environment
# variable. The "token" variable this script originally read is kept as a
# fallback so existing deployments keep working.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("token") or None
if hf_token:
    login(token=hf_token)
else:
    # login(token=None) falls back to an interactive prompt, which cannot be
    # answered on a headless server, so skip the call instead of blocking.
    print("⚠️ No Hugging Face token found in HF_TOKEN; skipping login.")

# ✅ Check for GPU: fail at start-up, before the model weights are loaded.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

# Model setup: load tokenizer and model (half precision, automatic device
# placement) once at import time and wrap them in a text-generation pipeline
# that analyze_document() reuses for every request.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
    # NOTE(review): trust_remote_code=True permits executing Python code
    # shipped inside the model repository; confirm it is actually required
    # for this model before keeping it.
    trust_remote_code=True,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# 📄 Extractors
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, each page followed by a newline.

    Pages for which ``extract_text()`` yields nothing are rendered to an image
    (resolution 300) and passed through Tesseract OCR instead.

    Args:
        file: Whatever ``pdfplumber.open`` accepts for the document to read.

    Returns:
        The concatenated page texts as a single string.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if not extracted:
                # No text came back for this page: fall back to OCR on a
                # rendered image of the page.
                rendered = page.to_image(resolution=300).original
                extracted = pytesseract.image_to_string(rendered)
            page_texts.append(extracted + "\n")
    return "".join(page_texts)

def extract_text_from_docx(file):
    """Return the non-blank paragraphs of a DOCX document joined by newlines.

    Args:
        file: Whatever ``docx.Document`` accepts for the document to read.

    Returns:
        One string with a newline between consecutive kept paragraphs.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    # NOTE(review): only ``document.paragraphs`` is read; text held in tables
    # is presumably not included - confirm against python-docx if resumes
    # with table layouts matter.
    # str.strip returns "" for whitespace-only text, which filter() drops.
    return "\n".join(filter(str.strip, paragraph_texts))

def chunk_text(text, max_chars=6000):
    """Split *text* into newline-terminated chunks of at most *max_chars* characters.

    Lines are packed greedily, in their original order, into the current
    chunk; a new chunk is started when the next line would not fit. A single
    line that is too long to fit in any chunk is hard-wrapped into pieces of
    ``max_chars - 1`` characters (one character is reserved for the trailing
    newline). Chunks that contain only whitespace are dropped, so empty input
    produces an empty list.

    Args:
        text: The text to split; lines are delimited by ``"\\n"``.
        max_chars: Upper bound on the length of each chunk, newlines included.
            Values below 2 are treated as 2, the smallest size that can hold
            one character plus its newline.

    Returns:
        A list of non-blank chunk strings, each no longer than the bound.
    """
    limit = max(2, max_chars)
    piece_len = limit - 1

    chunks = []
    lines = []  # newline-terminated lines of the chunk being built
    size = 0    # total length of everything currently in `lines`
    for para in text.split("\n"):
        # Hard-wrap lines that could never fit in one chunk; an empty line
        # still contributes a single (blank) piece.
        pieces = [
            para[start:start + piece_len]
            for start in range(0, len(para), piece_len)
        ] or [""]
        for piece in pieces:
            # Flush only a non-empty chunk, so no "" chunk is ever emitted.
            if lines and size + len(piece) >= limit:
                chunks.append("".join(lines))
                lines, size = [], 0
            lines.append(piece + "\n")
            size += len(piece) + 1
    if lines:
        chunks.append("".join(lines))

    # Whitespace-only chunks carry nothing worth sending to the model.
    return [chunk for chunk in chunks if chunk.strip()]

# 🧾 Prompt to return only key points
def create_prompt(text_chunk):
    """Build the extraction prompt for one chunk of resume text.

    The prompt lists the sections to extract, embeds *text_chunk* under a
    ``CONTENT:`` heading and ends with a reminder to answer in bullet points.

    Args:
        text_chunk: The resume text to embed; it is formatted into the prompt
            as-is.

    Returns:
        The complete prompt string, starting and ending with a newline.
    """
    wanted_sections = [
        "- Name",
        "- Email",
        "- Phone",
        "- Skills (just key skill names or topics)",
        "- Education (just degree, institution, year, no full sentences)",
        "- Experience (just role, company, time period)",
        "- Projects (project title and tech/tools used)",
        "- Certifications (only titles)",
    ]
    # The empty strings reproduce the blank lines (and the leading/trailing
    # newline) of the prompt layout once everything is joined with "\n".
    prompt_lines = [
        "",
        "Extract the following key details from this resume in SHORT key-point format (no long sentences). Return only clean bullet points:",
        "",
        *wanted_sections,
        "",
        "CONTENT:",
        f"{text_chunk}",
        "",
        "Only return bullet points under each section.",
        "",
    ]
    return "\n".join(prompt_lines)

def clean_output(raw_output):
    """Trim model output so it starts at the first ``Name:`` marker.

    Args:
        raw_output: The text to clean.

    Returns:
        The text from the first occurrence of ``"Name:"`` onwards, or the
        whole text when the marker is absent; surrounding whitespace is
        stripped in both cases.
    """
    marker_pos = raw_output.find("Name:")
    # find() returns -1 when the marker is missing: keep the text untouched.
    relevant = raw_output if marker_pos == -1 else raw_output[marker_pos:]
    return relevant.strip()

# 🚀 Main function
@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    """Extract key resume details from an uploaded PDF or DOCX file.

    The document text is extracted, split into chunks, and each chunk is sent
    to the text-generation pipeline with the extraction prompt. The cleaned
    per-chunk answers are joined with a ``---`` separator line.

    Args:
        file: The upload as delivered by the file input: an object exposing
            its path as ``.name``, a plain path string, or ``None`` when
            nothing was uploaded.
        cancel_flag: Truthy to abort. It is checked before each chunk is sent
            to the model; being a plain argument, its value is fixed for the
            duration of one call.

    Returns:
        A ``(result_text, status_text)`` tuple for the output box and the
        status box.
    """
    # Clicking Parse without an upload passes None; report it instead of
    # failing on `file.name`.
    if file is None:
        return "❌ No file uploaded", "❌ No file"

    # Accept both objects carrying the path in `.name` and bare path strings.
    path = getattr(file, "name", file)
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format", "❌ Invalid format"

    if not raw_text.strip():
        return "❌ No text found in document", "❌ Empty"

    summaries = []
    for chunk in chunk_text(raw_text):
        if cancel_flag:
            return "⛔ Cancelled", "⛔"
        # A blank chunk would spend a full generation on an empty prompt.
        if not chunk.strip():
            continue
        outputs = generator(
            create_prompt(chunk),
            max_new_tokens=1024,
            do_sample=False,
            # Return only the newly generated text. By default the prompt
            # (and with it the resume text) is echoed back, so a "Name:" in
            # the resume would make clean_output() keep the resume itself.
            return_full_text=False,
        )
        summaries.append(clean_output(outputs[0]["generated_text"]))

    # Joining puts the separator between sections only, so the result no
    # longer ends with a dangling "---".
    return "\n\n---\n\n".join(summaries).strip(), "✅ Completed"

# 🌐 Interface
with gr.Blocks(title="Smart Resume Parser - Key Points Edition") as demo:
    gr.Markdown("## 📄 Resume Parser – Summarized Key Points from PDF/DOCX")

    with gr.Row():
        # Left column: upload, action buttons and status line.
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("🔍 Parse", variant="primary")
                terminate_button = gr.Button("❌ Cancel", variant="stop")
            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)

        # Right column: the extracted key points.
        with gr.Column(scale=2):
            output_box = gr.Textbox(label="🧠 Resume Key Highlights", lines=30, interactive=False)

    # Still passed to analyze_document, whose signature takes a cancel flag.
    # Nothing writes to it any more, so it stays False.
    cancel_flag = gr.State(False)

    parse_event = analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box],
    )

    # Cancel through Gradio's `cancels` mechanism instead of writing to the
    # state. The state value is an input captured when Parse is clicked, so
    # changing it afterwards cannot reach a run already in progress, and a
    # flag left at True would make every later Parse report "Cancelled".
    # NOTE(review): per the Gradio docs, `cancels` drops pending runs and
    # stops iterating generators; a plain function that is already running
    # is allowed to finish in the background - confirm this is acceptable.
    terminate_button.click(
        fn=lambda: "⛔ Cancelled",
        inputs=[],
        outputs=[status_box],
        cancels=[parse_event],
    )

# Event cancellation relies on the request queue, so enable it explicitly.
demo.queue()
demo.launch(server_name="0.0.0.0", server_port=7860)