Spaces:

sejalkishan
/

Resume-parser

Runtime error

App Files Files Community

sejalkishan committed on Jul 16, 2025

Commit

260a5ff

verified ·

1 Parent(s): 2d465b2

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -63

app.py CHANGED Viewed

@@ -6,22 +6,31 @@ from huggingface_hub import login
 import pytesseract
 import torch
 import os
-import spaces
 import re
 login(token=os.environ.get("token"))
 if not torch.cuda.is_available():
-    raise RuntimeError("❌ GPU not detected!")
 print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
 model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
-            text += page.extract_text() or pytesseract.image_to_string(page.to_image().original)
     return text
 def extract_text_from_docx(file):
@@ -29,111 +38,106 @@ def extract_text_from_docx(file):
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
 def chunk_text(text, max_chars=6000):
-    chunks, current = [], ""
-    for line in text.split("\n"):
-        if len(current) + len(line) < max_chars:
-            current += line + "\n"
         else:
-            chunks.append(current)
-            current = line + "\n"
-    if current:
-        chunks.append(current)
     return chunks
-def create_prompt(text):
     return f"""
-Analyze the following resume and extract these key details clearly:
-- Name
-- Email
-- Phone
-- Skills
-- Education
-- Experience
-Format output like this:
-Name: ...
-Email: ...
-Phone: ...
-Skills:
-- ...
-- ...
-Education: ...
-Experience:
-- ...
-- ...
 CONTENT:
-{text}
 """
-def clean_model_output(output):
-    start_index = output.find("Name:")
-    if start_index != -1:
-        return output[start_index:].strip()
-    return output.strip()
 @spaces.GPU(duration=60)
-def analyze_resume(file, cancel_flag):
     ext = os.path.splitext(file.name)[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
-        return "❌ Unsupported file format", "❌ Try PDF or DOCX"
-    if not raw_text.strip():
-        return "❌ No text found in the document", "❌ Empty file"
     chunks = chunk_text(raw_text)
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
     model = AutoModelForCausalLM.from_pretrained(
-        model_id, device_map="auto", torch_dtype=torch.float16,
-        token=os.environ.get("token"), trust_remote_code=True
     )
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    final_summary = ""
     for i, chunk in enumerate(chunks):
         if cancel_flag:
-            return "⛔ Analysis cancelled by user.", "⛔ Cancelled"
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
-        print(f"\n🔹 Chunk {i+1} Output:\n{result}\n")
-        final_summary += clean_model_output(result) + "\n\n---\n\n"
-    return final_summary.strip(), "✅ Resume analysis complete"
-# 🌐 Gradio UI
-with gr.Blocks(title="Resume Parser - Key Insight Extractor") as demo:
-    gr.Markdown("## 📄 Resume Analyzer – Extract key information (Name, Email, Skills, etc)")
     with gr.Row():
         with gr.Column(scale=1):
-            file_input = gr.File(label="📎 Upload Resume (PDF or DOCX)")
             with gr.Row():
-                analyze_btn = gr.Button("🔍 Parse Resume", variant="primary")
-                stop_btn = gr.Button("❌ Cancel", variant="stop")
-            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)
         with gr.Column(scale=2):
-            output_text = gr.Textbox(label="🧠 Resume Key Points", lines=30, interactive=False)
     cancel_flag = gr.State(False)
-    analyze_btn.click(
-        fn=analyze_resume,
         inputs=[file_input, cancel_flag],
-        outputs=[output_text, status_box]
     )
-    stop_btn.click(
         fn=lambda: gr.update(value=True),
         inputs=[],
         outputs=[cancel_flag]
     )
-demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

 import pytesseract
 import torch
 import os
 import re
+import spaces
# Authenticate against the Hugging Face Hub with the access token stored in
# the "token" environment variable.
login(token=os.environ.get("token"))

# Refuse to start without a CUDA device; otherwise report which GPU is used.
if torch.cuda.is_available():
    print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")

# Hub identifier of the model loaded for resume analysis.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
 def extract_text_from_pdf(file):
     text = ""
     with pdfplumber.open(file) as pdf:
         for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + "\n"
+            else:
+                img = page.to_image(resolution=300).original
+                ocr_text = pytesseract.image_to_string(img)
+                text += ocr_text + "\n"
     return text
 def extract_text_from_docx(file):
     return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
 def chunk_text(text, max_chars=6000):
+    paragraphs = text.split("\n")
+    chunks, current_chunk = [], ""
+    for para in paragraphs:
+        if len(current_chunk) + len(para) < max_chars:
+            current_chunk += para + "\n"
         else:
+            chunks.append(current_chunk)
+            current_chunk = para + "\n"
+    if current_chunk:
+        chunks.append(current_chunk)
     return chunks
def create_prompt(text_chunk):
    """Build the extraction prompt for one chunk of resume text.

    The prompt lists the fields to extract, asks for nothing else, and then
    appends the chunk after a ``CONTENT:`` marker. It starts and ends with a
    newline.

    Args:
        text_chunk: The piece of resume text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    requested_fields = (
        "Name",
        "Email",
        "Phone",
        "Skills (bullet points)",
        "Education (bullet points)",
        "Experience (bullet points with org, role, period)",
        "Projects (bullet points with brief descriptions)",
    )
    prompt_lines = [
        "",
        "Analyze the following resume and extract ONLY the following fields in clean text format with clear labels:",
        *requested_fields,
        "Return only these details and nothing else.",
        "CONTENT:",
        f"{text_chunk}",
        "",
    ]
    return "\n".join(prompt_lines)
def extract_final_response(raw_output):
    """Trim leading text from the model output, keeping the final answer.

    Searches for ``Name:`` labels (whole word, optional whitespace before the
    colon). When at least two are present, everything before the second one
    is discarded; otherwise the whole output is kept.

    Args:
        raw_output: Text returned by the text-generation pipeline.

    Returns:
        The selected part of *raw_output* with surrounding whitespace removed.
    """
    # Single backslashes are required in a raw string: r"\\b" would match a
    # literal backslash followed by "b", so the pattern could never fire.
    matches = list(re.finditer(r"\bName\s*:", raw_output))
    if len(matches) >= 2:
        return raw_output[matches[1].start():].strip()
    return raw_output.strip()
 @spaces.GPU(duration=60)
+def analyze_document(file, cancel_flag):
     ext = os.path.splitext(file.name)[-1].lower()
     if ext == ".pdf":
         raw_text = extract_text_from_pdf(file)
     elif ext == ".docx":
         raw_text = extract_text_from_docx(file)
     else:
+        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format"
+    if len(raw_text.strip()) == 0:
+        return "❌ No text found in the document.", "❌ Empty document"
     chunks = chunk_text(raw_text)
+    full_summary = ""
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
     model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        token=os.environ.get("token"),
+        trust_remote_code=True
     )
     generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
     for i, chunk in enumerate(chunks):
         if cancel_flag:
+            return "⛔ Analysis cancelled by user.", "⛔ Terminated by user"
         prompt = create_prompt(chunk)
         result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
+        cleaned = extract_final_response(result)
+        full_summary += cleaned + "\n\n---\n\n"
+    return full_summary.strip(), "✅ Completed"
# Gradio page: a narrow control column (upload, buttons, status) next to a
# wider column holding the extracted result.
with gr.Blocks(title="Smart Resume Parser - AI Powered") as demo:
    gr.Markdown("## 📄 Resume Parser – Extract Key Info using Mistral-7B")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("🔍 Analyze", variant="primary")
                terminate_button = gr.Button("❌ Terminate", variant="stop")
            status_box = gr.Textbox(
                label="📊 Status",
                value="⏳ Waiting for input...",
                interactive=False,
            )
        with gr.Column(scale=2):
            output_box = gr.Textbox(
                label="🧠 Extracted Resume Info",
                lines=30,
                interactive=False,
            )

    # Per-session flag handed to analyze_document; starts out False.
    cancel_flag = gr.State(False)

    # "Analyze" runs the parser and fills the result and status boxes.
    analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box],
    )

    # "Terminate" writes True into the cancel flag.
    # NOTE(review): analyze_document receives the flag's value as an argument
    # when it is invoked, and nothing here resets the flag to False — confirm
    # that cancelling a running analysis and re-running afterwards behave as
    # intended.
    terminate_button.click(
        fn=lambda: gr.update(value=True),
        inputs=[],
        outputs=[cancel_flag],
    )

# Serve on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)