import gradio as gr
import pdfplumber
import docx
import os
import pytesseract
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from huggingface_hub import login
import spaces

# 🔐 Hugging Face auth: prefer the conventional HF_TOKEN variable, but keep
# falling back to the legacy "token" name so existing deployments still work.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("token")
login(token=HF_TOKEN)

# ✅ Fail fast without a GPU: a 7B model in fp16 is impractical on CPU.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

# Model setup
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    token=HF_TOKEN,
    trust_remote_code=True,
)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


# 📄 Extractors
def extract_text_from_pdf(file):
    """Extract text from a PDF; OCR any page without an embedded text layer."""
    parts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                parts.append(page_text + "\n")
            else:
                # Scanned page: rasterize at 300 dpi and run Tesseract OCR.
                img = page.to_image(resolution=300).original
                parts.append(pytesseract.image_to_string(img) + "\n")
    return "".join(parts)


def extract_text_from_docx(file):
    """Return all non-empty paragraph text from a DOCX file, newline-joined."""
    doc = docx.Document(file)
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


def chunk_text(text, max_chars=6000):
    """Split *text* into chunks of roughly *max_chars*, on paragraph boundaries.

    A single paragraph longer than max_chars still becomes its own chunk.
    Fix vs. original: never emit an empty chunk (the old code appended the
    empty accumulator when the very first paragraph exceeded the limit).
    """
    chunks, current = [], ""
    for para in text.split("\n"):
        if len(current) + len(para) < max_chars:
            current += para + "\n"
        else:
            if current:
                chunks.append(current)
            current = para + "\n"
    if current:
        chunks.append(current)
    return chunks


# 🧾 Prompt to return only key points
def create_prompt(text_chunk):
    """Build the instruction prompt asking for bullet-point resume extraction."""
    return f"""
Extract the following key details from this resume in SHORT key-point format (no long sentences).
Return only clean bullet points:
- Name
- Email
- Phone
- Skills (just key skill names or topics)
- Education (just degree, institution, year, no full sentences)
- Experience (just role, company, time period)
- Projects (project title and tech/tools used)
- Certifications (only titles)

CONTENT:
{text_chunk}

Only return bullet points under each section.
"""


def clean_output(raw_output):
    """Trim any echoed prompt text: keep output from the first name marker on.

    The prompt requests bullet format, so look for "- Name" first; keep the
    old "Name:" marker as a fallback (the original only checked "Name:",
    which rarely matches bullet-formatted output).
    """
    for marker in ("- Name", "Name:"):
        idx = raw_output.find(marker)
        if idx != -1:
            return raw_output[idx:].strip()
    return raw_output.strip()


# 🚀 Main function
@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    """Parse an uploaded resume (PDF/DOCX) into bullet-point highlights.

    Returns a (summary_text, status_message) pair for the two output widgets.
    """
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format", "❌ Invalid format"

    if not raw_text.strip():
        return "❌ No text found in document", "❌ Empty"

    summaries = []
    for chunk in chunk_text(raw_text):
        # NOTE(review): cancel_flag is a snapshot taken when the click fired,
        # so it can only skip runs cancelled *before* this call started —
        # it cannot interrupt a generation already in progress.
        if cancel_flag:
            return "⛔ Cancelled", "⛔"
        prompt = create_prompt(chunk)
        # return_full_text=False: receive only the generated continuation,
        # not the echoed prompt (fixes prompt leakage into the summary).
        result = generator(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            return_full_text=False,
        )[0]["generated_text"]
        summaries.append(clean_output(result))

    # join() avoids the dangling "---" separator the old += loop left behind.
    return "\n\n---\n\n".join(summaries).strip(), "✅ Completed"


# 🌐 Interface
with gr.Blocks(title="Smart Resume Parser - Key Points Edition") as demo:
    gr.Markdown("## 📄 Resume Parser – Summarized Key Points from PDF/DOCX")
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("🔍 Parse", variant="primary")
                terminate_button = gr.Button("❌ Cancel", variant="stop")
            status_box = gr.Textbox(
                label="📊 Status", value="⏳ Waiting...", interactive=False
            )
        with gr.Column(scale=2):
            output_box = gr.Textbox(
                label="🧠 Resume Key Highlights", lines=30, interactive=False
            )

    cancel_flag = gr.State(False)

    analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box],
    )
    # Bug fix: a gr.State output receives a raw value, not gr.update(...) —
    # the original stored an update-dict in the flag instead of True.
    terminate_button.click(
        fn=lambda: True,
        inputs=[],
        outputs=[cancel_flag],
    )

demo.launch(server_name="0.0.0.0", server_port=7860)