Spaces:

sejalkishan
/

Resume-parser

Runtime error

App Files Files Community

sejalkishan committed on Jul 16, 2025

Commit

2d465b2

verified ·

1 Parent(s): 057c243

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -86

app.py CHANGED Viewed

@@ -7,163 +7,127 @@ import pytesseract
 import torch
 import os
 import spaces
-import json
 import re
-# 🔐 Authenticate Hugging Face Token
 # Authenticate with the Hugging Face Hub using the value of the "token" environment variable.
 # NOTE(review): if the variable is unset this passes token=None - confirm login() copes with that non-interactively.
 login(token=os.environ.get("token"))
 # Fail at import time when no CUDA device is visible; otherwise report which device is in use.
 if torch.cuda.is_available():
     print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
 else:
     raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
 # Hub identifier of the checkpoint loaded by analyze_resume.
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
-# 📄 Extract text from PDF or DOCX
 def extract_text_from_pdf(file):
     """Return the text of every page of a PDF, one newline-terminated entry per page.
 
     Pages for which pdfplumber yields no text are rendered to an image at
     300 DPI and passed through Tesseract instead.
     """
     pieces = []
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
             extracted = page.extract_text()
             if not extracted:
                 # No text layer on this page: fall back to OCR on a rendered image.
                 rendered = page.to_image(resolution=300).original
                 extracted = pytesseract.image_to_string(rendered)
             pieces.append(extracted + "\n")
     return "".join(pieces)
 def extract_text_from_docx(file):
     """Return the non-blank paragraphs of a DOCX document joined by newlines."""
     document = docx.Document(file)
     kept = []
     for paragraph in document.paragraphs:
         # Paragraphs that are empty or whitespace-only are dropped.
         if paragraph.text.strip():
             kept.append(paragraph.text)
     return "\n".join(kept)
-# 📦 Chunking long text
 def chunk_text(text, max_chars=6000):
     """Split *text* into chunks of at most *max_chars* characters.
 
     Text is split on newlines and whole paragraphs are packed greedily into
     newline-terminated chunks. A paragraph that cannot fit in a chunk on its
     own is cut into *max_chars*-sized pieces, so no chunk exceeds the budget
     and no empty chunk is produced. Concatenating the chunks yields
     ``text + "\n"``.
 
     Raises:
         ValueError: if *max_chars* is smaller than 1.
     """
     if max_chars < 1:
         raise ValueError("max_chars must be a positive integer")
     chunks, current = [], ""
     for para in text.split("\n"):
         # Hard-split an over-long paragraph; flush pending text first to keep order.
         while len(para) >= max_chars:
             if current:
                 chunks.append(current)
                 current = ""
             chunks.append(para[:max_chars])
             para = para[max_chars:]
         if len(current) + len(para) < max_chars:
             current += para + "\n"
         else:
             chunks.append(current)
             current = para + "\n"
     if current:
         chunks.append(current)
     return chunks
-# 🧾 Prompt Template (strict JSON-only)
def create_resume_prompt(text_chunk):
    """Build the JSON-only extraction prompt for one chunk of resume text.

    The prompt starts with a blank line, gives the instructions and the empty
    JSON skeleton, and ends with the chunk under a ``CONTENT:`` header.
    """
    prompt_lines = (
        "",
        "You are a resume parsing engine.",
        "Extract only the following fields from the content below and return them as a valid JSON object.",
        "Do not include any explanation or formatting — only the JSON.",
        "{",
        '  "name": "",',
        '  "email": "",',
        '  "phone": "",',
        '  "skills": [],',
        '  "education": "",',
        '  "experience": []',
        "}",
        "CONTENT:",
        f"{text_chunk}",
        "",
    )
    return "\n".join(prompt_lines)
-# 🧼 Regex JSON extractor
def clean_to_json(generated):
    """Return the first JSON object embedded in *generated* as a dict.

    Every ``{`` is tried in turn as the start of an object and decoded with
    ``json.JSONDecoder.raw_decode``. Unlike a non-greedy ``{...}`` regex,
    this handles nested objects and braces inside strings, and skips brace
    groups that are not valid JSON.

    Returns:
        The decoded dict, or ``{"error": ...}`` when the text holds no ``{``
        at all or none of the candidates decodes.
    """
    if not isinstance(generated, str):
        return {"error": f"❌ JSON parsing failed: expected str, got {type(generated).__name__}"}
    decoder = json.JSONDecoder()
    last_error = None
    start = generated.find("{")
    while start != -1:
        try:
            parsed, end = decoder.raw_decode(generated, start)
        except json.JSONDecodeError as exc:
            # Not valid JSON from this brace; remember why and try the next one.
            last_error = exc
        else:
            print("🧾 Cleaned JSON block:\n", generated[start:end])
            return parsed
        start = generated.find("{", start + 1)
    if last_error is not None:
        return {"error": f"❌ JSON parsing failed: {last_error}"}
    return {"error": "❌ No JSON object found in model output"}
-# 🚀 Main Resume Analysis
 def _merge_resume_fields(final_output, parsed):
     """Fold the fields parsed from one chunk into *final_output* in place."""
     for key, current in final_output.items():
         value = parsed.get(key)
         if not value:
             continue
         if isinstance(current, list):
             # Accept a bare scalar where a list was requested, and de-duplicate by
             # membership so unhashable items (e.g. dicts) work and order is kept.
             for item in value if isinstance(value, list) else [value]:
                 if item not in current:
                     current.append(item)
         elif not current:
             # Scalar fields keep the first non-empty value seen.
             final_output[key] = value
 @spaces.GPU(duration=60)
 def analyze_resume(file, cancel_flag):
     """Parse an uploaded PDF/DOCX resume into a dict of structured fields.
 
     Args:
         file: the uploaded file; its ``name`` attribute (or the value itself,
             when it has none) is used to read the extension.
         cancel_flag: checked before each chunk; a truthy value aborts.
             NOTE(review): the value is received once per call, so it looks
             like it cannot change while the loop runs - confirm how the
             Cancel button is wired.
 
     Returns:
         ``(data, status)``: the merged field dict (or an ``{"error": ...}``
         dict) and a short status message.
     """
     if file is None:
         return {"error": "❌ No file uploaded"}, "❌ Upload a valid PDF or DOCX"
     ext = os.path.splitext(getattr(file, "name", file))[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
         return {"error": "❌ Invalid file format"}, "❌ Upload a valid PDF or DOCX"
     if not raw_text.strip():
         return {"error": "❌ No text found in resume"}, "❌ Empty file"
     chunks = chunk_text(raw_text)
     hub_token = os.environ.get("token")
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=hub_token)
     # NOTE(review): trust_remote_code=True allows repository code to run - confirm it is needed for this checkpoint.
     model = AutoModelForCausalLM.from_pretrained(
         model_id,
         device_map="auto",
         torch_dtype=torch.float16,
         token=hub_token,
         trust_remote_code=True
     )
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     final_output = {
         "name": "",
         "email": "",
         "phone": "",
         "skills": [],
         "education": "",
         "experience": []
     }
     for i, chunk in enumerate(chunks):
         if cancel_flag:
             return {"error": "⛔ Cancelled"}, "⛔ User cancelled"
         prompt = create_resume_prompt(chunk)
         # return_full_text=False keeps the echoed prompt (which contains the empty
         # JSON template) out of the text handed to the JSON extractor.
         result = generator(
             prompt, max_new_tokens=1024, do_sample=False, return_full_text=False
         )[0]["generated_text"]
         print(f"\n\n🔍 Chunk {i+1} Output:\n{result}\n\n")
         parsed = clean_to_json(result)
         if isinstance(parsed, dict):
             _merge_resume_fields(final_output, parsed)
     return final_output, "✅ Resume parsed successfully!"
 # 🌐 Gradio UI
-with gr.Blocks(title="Smart Resume Parser - AI Edition") as demo:
-    gr.Markdown("## 📄 Resume Parser – Extract structured info using Mistral 7B (GPU Accelerated)")
     with gr.Row():
         with gr.Column(scale=1):
-            file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
             with gr.Row():
                 analyze_btn = gr.Button("🔍 Parse Resume", variant="primary")
                 stop_btn = gr.Button("❌ Cancel", variant="stop")
-            status = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)
         with gr.Column(scale=2):
-            json_output = gr.JSON(label="🧠 Extracted Resume Data")
     cancel_flag = gr.State(False)
     analyze_btn.click(
         fn=analyze_resume,
         inputs=[file_input, cancel_flag],
-        outputs=[json_output, status]
     )
     stop_btn.click(

 import torch
 import os
 import spaces
 import re
 # Authenticate with the Hugging Face Hub using the value of the "token" environment variable.
 # NOTE(review): if the variable is unset this passes token=None - confirm login() copes with that non-interactively.
 login(token=os.environ.get("token"))
 # Abort at import time when no CUDA device is visible; otherwise report the device in use.
 if torch.cuda.is_available():
     print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
 else:
     raise RuntimeError("❌ GPU not detected!")
 # Hub identifier of the checkpoint loaded by analyze_resume.
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 def extract_text_from_pdf(file):
     """Return the text of every page of a PDF, one newline-terminated entry per page.
 
     Pages for which pdfplumber yields no text are rendered to an image and
     passed through Tesseract instead.
     """
     pieces = []
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
             page_text = page.extract_text()
             if not page_text:
                 # Render at 300 DPI for OCR rather than the library's default resolution.
                 page_image = page.to_image(resolution=300).original
                 page_text = pytesseract.image_to_string(page_image)
             # Terminate each page with a newline so the last word of one page does
             # not fuse with the first word of the next, and so chunk_text (which
             # splits on newlines) sees a boundary between pages.
             pieces.append(page_text + "\n")
     return "".join(pieces)
 def extract_text_from_docx(file):
     """Join the text of all non-blank paragraphs of a DOCX document with newlines."""
     paragraphs = docx.Document(file).paragraphs
     # Keep only paragraphs that contain something other than whitespace.
     non_blank = (p.text for p in paragraphs if p.text.strip())
     return "\n".join(non_blank)
 def chunk_text(text, max_chars=6000):
     """Split *text* into chunks of at most *max_chars* characters.
 
     Lines are packed greedily into newline-terminated chunks. A line that
     cannot fit in a chunk on its own is cut into *max_chars*-sized pieces,
     so no chunk exceeds the budget and no empty chunk is produced.
     Concatenating the chunks yields ``text + "\n"``.
 
     Raises:
         ValueError: if *max_chars* is smaller than 1.
     """
     if max_chars < 1:
         raise ValueError("max_chars must be a positive integer")
     chunks, current = [], ""
     for line in text.split("\n"):
         # Hard-split an over-long line; flush pending text first to keep order.
         while len(line) >= max_chars:
             if current:
                 chunks.append(current)
                 current = ""
             chunks.append(line[:max_chars])
             line = line[max_chars:]
         if len(current) + len(line) < max_chars:
             current += line + "\n"
         else:
             chunks.append(current)
             current = line + "\n"
     if current:
         chunks.append(current)
     return chunks
def create_prompt(text):
    """Build the key-detail extraction prompt for one piece of resume text.

    The prompt starts with a blank line, lists the fields to extract, shows
    the expected output layout, and ends with *text* under ``CONTENT:``.
    """
    prompt_lines = (
        "",
        "Analyze the following resume and extract these key details clearly:",
        "- Name",
        "- Email",
        "- Phone",
        "- Skills",
        "- Education",
        "- Experience",
        "Format output like this:",
        "Name: ...",
        "Email: ...",
        "Phone: ...",
        "Skills:",
        "- ...",
        "- ...",
        "Education: ...",
        "Experience:",
        "- ...",
        "- ...",
        "CONTENT:",
        f"{text}",
        "",
    )
    return "\n".join(prompt_lines)
def clean_model_output(output):
    """Trim *output* so it starts at the first ``Name:`` marker.

    Everything before the first occurrence of ``Name:`` is discarded. When
    the marker is absent the whole text is kept. Surrounding whitespace is
    stripped in both cases.
    """
    _, marker, remainder = output.partition("Name:")
    kept = marker + remainder if marker else output
    return kept.strip()
 @spaces.GPU(duration=60)
 def analyze_resume(file, cancel_flag):
     """Extract key resume details from an uploaded PDF/DOCX as formatted text.
 
     Args:
         file: the uploaded file; its ``name`` attribute (or the value itself,
             when it has none) is used to read the extension.
         cancel_flag: checked before each chunk; a truthy value aborts.
             NOTE(review): the value is received once per call, so it looks
             like it cannot change while the loop runs - confirm how the
             Cancel button is wired.
 
     Returns:
         ``(summary, status)``: the per-chunk model answers separated by
         ``---`` lines (or an error message) and a short status message.
     """
     if file is None:
         return "❌ No file uploaded", "❌ Try PDF or DOCX"
     ext = os.path.splitext(getattr(file, "name", file))[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
         return "❌ Unsupported file format", "❌ Try PDF or DOCX"
     if not raw_text.strip():
         return "❌ No text found in the document", "❌ Empty file"
     chunks = chunk_text(raw_text)
     hub_token = os.environ.get("token")
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=hub_token)
     # NOTE(review): trust_remote_code=True allows repository code to run - confirm it is needed for this checkpoint.
     model = AutoModelForCausalLM.from_pretrained(
         model_id, device_map="auto", torch_dtype=torch.float16,
         token=hub_token, trust_remote_code=True
     )
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     sections = []
     for i, chunk in enumerate(chunks):
         if cancel_flag:
             return "⛔ Analysis cancelled by user.", "⛔ Cancelled"
         prompt = create_prompt(chunk)
         # return_full_text=False keeps the echoed prompt out of the result; the
         # prompt itself contains "Name:", which clean_model_output would
         # otherwise latch onto and return the prompt plus the resume text.
         result = generator(
             prompt, max_new_tokens=1024, do_sample=False, return_full_text=False
         )[0]["generated_text"]
         print(f"\n🔹 Chunk {i+1} Output:\n{result}\n")
         sections.append(clean_model_output(result))
     # Join with the separator between sections only, so the text does not end in a dangling "---".
     return "\n\n---\n\n".join(sections).strip(), "✅ Resume analysis complete"
 # 🌐 Gradio UI
+with gr.Blocks(title="Resume Parser - Key Insight Extractor") as demo:
+    gr.Markdown("## 📄 Resume Analyzer – Extract key information (Name, Email, Skills, etc)")
     with gr.Row():
         with gr.Column(scale=1):
+            file_input = gr.File(label="📎 Upload Resume (PDF or DOCX)")
             with gr.Row():
                 analyze_btn = gr.Button("🔍 Parse Resume", variant="primary")
                 stop_btn = gr.Button("❌ Cancel", variant="stop")
+            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)
         with gr.Column(scale=2):
+            output_text = gr.Textbox(label="🧠 Resume Key Points", lines=30, interactive=False)
     cancel_flag = gr.State(False)
     analyze_btn.click(
         fn=analyze_resume,
         inputs=[file_input, cancel_flag],
+        outputs=[output_text, status_box]
     )
     stop_btn.click(