# Hugging Face Spaces app (the Space status page previously reported "Runtime error").
import gradio as gr
import pdfplumber
import docx
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import pytesseract
import torch
import os
import spaces
import re

# Authenticate with the Hugging Face Hub using the Space secret named "token".
# NOTE(review): os.environ.get("token") returns None when the secret is unset,
# which makes login() fail at startup — confirm the secret is configured.
login(token=os.environ.get("token"))

# Fail fast when no CUDA device is present: the 8B model loaded below needs a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("β GPU not detected! Please enable GPU in Space settings.")
print(f"β Using GPU: {torch.cuda.get_device_name(0)}")

# Hub id of the instruction-tuned model used for resume extraction.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
| # Document extractors | |
def extract_text_from_pdf(file):
    """Extract all text from a PDF, falling back to OCR for image-only pages.

    Pages with an embedded text layer are read directly via pdfplumber;
    pages without one are rendered at 300 DPI and run through pytesseract.
    Returns one string with a trailing newline after each page's text.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                page_texts.append(extracted)
            else:
                # No text layer on this page: rasterize it and OCR the image.
                rendered = page.to_image(resolution=300).original
                page_texts.append(pytesseract.image_to_string(rendered))
    return "".join(part + "\n" for part in page_texts)
def extract_text_from_docx(file):
    """Return the non-blank paragraphs of a .docx file, joined with newlines."""
    document = docx.Document(file)
    lines = []
    for paragraph in document.paragraphs:
        # Skip paragraphs that contain only whitespace.
        if paragraph.text.strip() != "":
            lines.append(paragraph.text)
    return "\n".join(lines)
def chunk_text(text, max_chars=6000):
    """Split *text* into paragraph-aligned chunks of roughly *max_chars* characters.

    Paragraphs (newline-separated) are accumulated until adding the next one
    would reach *max_chars*; each emitted chunk keeps its trailing newlines.
    A single paragraph longer than *max_chars* becomes its own oversized chunk.

    Args:
        text: The full document text.
        max_chars: Soft upper bound on chunk length, in characters.

    Returns:
        List of non-empty chunk strings covering all of *text*.
    """
    paragraphs = text.split("\n")
    chunks, current_chunk = [], ""
    for para in paragraphs:
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + "\n"
        else:
            # Bug fix: when the very first paragraph already exceeds max_chars,
            # current_chunk is still "" — do not emit a spurious empty chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = para + "\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
| # Prompt for resume parsing | |
def create_prompt(text_chunk):
    """Build the extraction prompt for one resume chunk.

    Asks the model for bullet points under fixed section headings that
    convert_output_to_json later parses.
    """
    # The template is intentionally left-aligned: it is part of the f-string,
    # so any added indentation would leak into the prompt text.
    return f"""
Extract the following key details from this resume in SHORT key-point format (no long sentences). Return only clean bullet points:
- Name
- Phone
- Skills (just key skill names or topics)
- Education (just degree, institution, year, no full sentences)
- Experience (just role, company, time period)
- Projects (project title and tech/tools used)
- Certifications (only titles)
CONTENT:
{text_chunk}
Only return bullet points under each section.
"""
| # Cleaner to extract only final part | |
def extract_final_response(raw_output):
    """Trim the model's prompt echo: keep text from the first 'name:' marker on.

    The search is case-insensitive; when no marker exists, the whole output
    is returned. The result is whitespace-stripped either way.
    """
    marker_pos = raw_output.lower().find("name:")
    if marker_pos == -1:
        return raw_output.strip()
    return raw_output[marker_pos:].strip()
| # Convert final text summary to structured JSON | |
def convert_output_to_json(summary_text):
    """Parse the model's bullet-point summary into a structured dict.

    Recognizes 'Name:', 'Email:', 'Phone:' inline fields (case-insensitive),
    section headings (Skills/Education/Experience/Projects/Certifications),
    and '- ' bullet lines, which are collected under the active section.
    Unrecognized lines are ignored.
    """
    parsed = {
        "name": "",
        "email": "",
        "phone": "",
        "skills": [],
        "education": [],
        "experience": [],
        "projects": [],
        "certifications": [],
    }
    # One heading pattern per list-valued section, e.g. r"^skills\b[:\s]*".
    headings = {
        section: re.compile(r"^" + section + r"\b[:\s]*", re.IGNORECASE)
        for section in ("skills", "education", "experience", "projects", "certifications")
    }
    active_section = None
    for raw_line in summary_text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        lowered = stripped.lower()
        # Inline scalar fields take priority over section handling.
        if lowered.startswith("name:"):
            parsed["name"] = stripped[5:].strip()
            continue
        if lowered.startswith("email:"):
            parsed["email"] = stripped[6:].strip()
            continue
        if lowered.startswith("phone:"):
            parsed["phone"] = stripped[6:].strip()
            continue
        # A heading line switches the active section and is not stored itself.
        matched = next(
            (name for name, pattern in headings.items() if pattern.match(lowered)),
            None,
        )
        if matched is not None:
            active_section = matched
            continue
        # Bullet lines accumulate under whichever section heading came last.
        if active_section and stripped.startswith("- "):
            parsed[active_section].append(stripped[2:].strip())
    return parsed
| # Main inference function | |
# Cache for the expensive tokenizer/model/pipeline load, so repeated clicks
# of "Analyze" reuse one loaded model instead of re-loading the 8B weights.
_GENERATOR_CACHE = {}

def _get_generator():
    """Lazily build and cache the text-generation pipeline for model_id."""
    if "generator" not in _GENERATOR_CACHE:
        hf_token = os.environ.get("token")
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            token=hf_token,
            trust_remote_code=True,
        )
        _GENERATOR_CACHE["generator"] = pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )
    return _GENERATOR_CACHE["generator"]

def analyze_document(file, cancel_flag):
    """Extract structured resume info from an uploaded PDF/DOCX file.

    Args:
        file: Uploaded file object (Gradio gr.File) with a .name attribute.
        cancel_flag: Boolean from gr.State; checked before each chunk.
            NOTE(review): this is a snapshot taken when the call starts — a
            later Terminate click cannot change it mid-run; confirm intent.

    Returns:
        Tuple of (summary text, status message, parsed JSON dict).
    """
    ext = os.path.splitext(file.name)[-1].lower()
    if ext == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "β Unsupported file format. Please upload a PDF or DOCX.", "β Invalid format", {}
    if len(raw_text.strip()) == 0:
        return "β No text found in the document.", "β Empty document", {}
    chunks = chunk_text(raw_text)
    full_summary = ""
    # Fix: previously the tokenizer, model, and pipeline were rebuilt on every
    # invocation; now they are loaded once and cached.
    generator = _get_generator()
    for chunk in chunks:
        if cancel_flag:
            return "β Analysis cancelled by user.", "β Terminated by user", {}
        prompt = create_prompt(chunk)
        # Greedy decoding (do_sample=False) keeps extraction deterministic.
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        cleaned = extract_final_response(result)
        full_summary += cleaned + "\n\n---\n\n"
    final_text = full_summary.strip()
    json_data = convert_output_to_json(final_text)
    return final_text, "β Completed", json_data
| # Gradio Interface | |
| with gr.Blocks(title="Smart Resume Parser - AI Powered") as demo: | |
| gr.Markdown("## π Resume Parser β Extract Key Info using LLaMA 3 8B") | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| file_input = gr.File(label="π Upload Resume (PDF/DOCX)") | |
| with gr.Row(): | |
| analyze_button = gr.Button("π Analyze", variant="primary") | |
| terminate_button = gr.Button("β Terminate", variant="stop") | |
| status_box = gr.Textbox(label="π Status", value="β³ Waiting for input...", interactive=False) | |
| with gr.Column(scale=2): | |
| output_box = gr.Textbox(label="π§ Extracted Resume Info", lines=30, interactive=False) | |
| json_output = gr.JSON(label="π§Ύ Resume JSON Output") | |
| cancel_flag = gr.State(False) | |
| analyze_button.click( | |
| fn=analyze_document, | |
| inputs=[file_input, cancel_flag], | |
| outputs=[output_box, status_box, json_output] | |
| ) | |
| terminate_button.click( | |
| fn=lambda: gr.update(value=True), | |
| inputs=[], | |
| outputs=[cancel_flag] | |
| ) | |
| demo.launch(server_name="0.0.0.0", server_port=7860) | |