Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,91 +1,151 @@

Old version (removed lines, partially truncated):

import gradio as gr
-import
-from fastapi import FastAPI
import docx
-import
-#
def extract_text_from_pdf(file):
-    doc = fitz.open(stream=file.read(), filetype="pdf")
    text = ""
    return text

-# Extract text from DOCX file
def extract_text_from_docx(file):
    doc = docx.Document(file)
    else:
-        return {"error": "

    with gr.Row():
        with gr.Column(scale=1):
-            file_input = gr.File(label="Upload Resume (PDF
            with gr.Row():
        with gr.Column(scale=2):
    )
-        fn=lambda: (
        inputs=[],
-        outputs=[
    )

-app = gr.mount_gradio_app(app=FastAPI(), blocks=demo, path="/")

-# Local Dev Testing
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run("app:app", host="0.0.0.0", port=7860)

-# Hugging Face Compatibility Fix
-import sys
-if __name__ != "__main__":
-    sys.modules["app"] = sys.modules[__name__]
New version:

import gradio as gr
+import pdfplumber
import docx
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import login
+import pytesseract
+import torch
+import os
+import spaces

+# Authenticate
+login(token=os.environ.get("token"))
+
+# GPU Check
+if not torch.cuda.is_available():
+    raise RuntimeError("GPU not detected! Please enable GPU in Space settings.")
+print(f"Using GPU: {torch.cuda.get_device_name(0)}")
+
+# Model
+model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+# Extractors
def extract_text_from_pdf(file):
    text = ""
+    with pdfplumber.open(file) as pdf:
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
+            else:
+                img = page.to_image(resolution=300).original
+                text += pytesseract.image_to_string(img) + "\n"
    return text

def extract_text_from_docx(file):
    doc = docx.Document(file)
+    return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
+
+def chunk_text(text, max_chars=6000):
+    paras = text.split("\n")
+    chunks, current = [], ""
+    for para in paras:
+        if len(current) + len(para) < max_chars:
+            current += para + "\n"
+        else:
+            chunks.append(current)
+            current = para + "\n"
+    if current:
+        chunks.append(current)
+    return chunks
+
+# Resume Prompt
+def create_resume_prompt(text_chunk):
+    return f"""
+You are an AI assistant trained to parse resumes. Extract the following information in JSON format based on the content below.
+
+Return only valid JSON like this example:
+{{
+  "name": "John Doe",
+  "email": "john@example.com",
+  "phone": "+1-1234567890",
+  "skills": ["Python", "Java", "Machine Learning"],
+  "education": "B.Tech in Computer Science from MIT",
+  "experience": "3 years as Software Engineer at Google"
+}}
+
+CONTENT:
+{text_chunk}
+"""
+
+# Clean JSON output
+def clean_to_json(generated):
+    try:
+        start = generated.index('{')
+        end = generated.rindex('}') + 1
+        return generated[start:end]
+    except:
+        return '{"error": "Failed to extract JSON from model output"}'
+
+# Main Resume Analyzer
+@spaces.GPU(duration=60)
+def analyze_resume(file, cancel_flag):
+    ext = os.path.splitext(file.name)[-1].lower()
+
+    if ext == ".pdf":
+        raw_text = extract_text_from_pdf(file)
+    elif ext == ".docx":
+        raw_text = extract_text_from_docx(file)
    else:
+        return {"error": "Invalid format"}, "Please upload a valid PDF or DOCX file."

+    if not raw_text.strip():
+        return {"error": "No text found"}, "Empty resume"

+    chunks = chunk_text(raw_text)
+    full_json = {}
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        token=os.environ.get("token"),
+        trust_remote_code=True
+    )
+    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+    for i, chunk in enumerate(chunks):
+        if cancel_flag:
+            return {"error": "Terminated by user"}, "Analysis cancelled"
+        prompt = create_resume_prompt(chunk)
+        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
+        cleaned = clean_to_json(result)
+        try:
+            chunk_json = eval(cleaned) if isinstance(cleaned, str) else cleaned
+            full_json.update(chunk_json)
+        except:
+            continue
+
+    return full_json, "Resume parsed successfully!"
+
+# Gradio UI
+with gr.Blocks(title="Smart Resume Parser - AI Edition") as demo:
+    gr.Markdown("## Resume Parser – Extract structured info using Mistral 7B (GPU Accelerated)")

    with gr.Row():
        with gr.Column(scale=1):
+            file_input = gr.File(label="Upload Resume (PDF/DOCX)")
            with gr.Row():
+                analyze_btn = gr.Button("Parse Resume", variant="primary")
+                stop_btn = gr.Button("Cancel", variant="stop")
+            status = gr.Textbox(label="Status", value="Waiting...", interactive=False)

        with gr.Column(scale=2):
+            json_output = gr.JSON(label="Extracted Resume Data")

+    cancel_flag = gr.State(False)
+
+    analyze_btn.click(
+        fn=analyze_resume,
+        inputs=[file_input, cancel_flag],
+        outputs=[json_output, status]
    )

+    stop_btn.click(
+        fn=lambda: gr.update(value=True),
        inputs=[],
+        outputs=[cancel_flag]
    )

+demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
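
A small usage sketch of the helpers introduced above (not part of the commit), showing how chunk_text splits long text and that clean_to_json returns a string, not a parsed object:

# Hedged usage sketch; assumes chunk_text and clean_to_json from the new app.py are in scope.
sample = "\n".join(f"Line {i}: some resume text" for i in range(5))
pieces = chunk_text(sample, max_chars=50)
print(len(pieces), [len(p) for p in pieces])   # several chunks, each capped near max_chars

model_output = 'Sure, here is the JSON:\n{"name": "John Doe", "skills": ["Python"]}'
print(clean_to_json(model_output))             # the {...} substring only, still a string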
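
Because clean_to_json returns a string, the analyzer loop then runs eval on model output; eval rejects JSON literals such as true, false, and null, and executes whatever expression the model emits. A minimal sketch of a json.loads-based alternative (the helper name parse_chunk_json is illustrative, not from the repo):

# Hedged sketch, not part of the commit: strict JSON parsing instead of eval.
import json

def parse_chunk_json(generated: str) -> dict:
    # Grab the first '{' through the last '}' and parse it strictly as JSON.
    try:
        start = generated.index("{")
        end = generated.rindex("}") + 1
        return json.loads(generated[start:end])
    except (ValueError, json.JSONDecodeError):
        return {"error": "Failed to extract JSON from model output"}

In the loop above, chunk_json = parse_chunk_json(result) would replace the clean_to_json / eval pair.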
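
The Space header reports a runtime error. One possible explanation, assuming this Space runs on ZeroGPU hardware (suggested by import spaces and the @spaces.GPU decorator): on ZeroGPU, torch.cuda.is_available() returns False at import time and a GPU is only attached inside functions decorated with @spaces.GPU, so the module-level GPU check in the diff would raise on startup. A sketch of deferring the check (the function name gpu_smoke_test is illustrative):

# Hedged sketch (assumes ZeroGPU hardware; not part of the commit): check for CUDA
# inside a @spaces.GPU function, where the GPU is actually attached.
import torch
import spaces

@spaces.GPU(duration=60)
def gpu_smoke_test():
    # The GPU is available only while this decorated function is running.
    if not torch.cuda.is_available():
        return "No GPU attached"
    return f"Using GPU: {torch.cuda.get_device_name(0)}"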