sejalkishan committed on
Commit
321a3c2
·
verified ·
1 Parent(s): 456a1db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -67
app.py CHANGED
@@ -1,91 +1,151 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
- from fastapi import FastAPI
4
  import docx
5
- import re
 
 
 
 
 
6
 
7
- # 📄 Extract text from PDF file
 
 
 
 
 
 
 
 
 
 
 
8
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        file: Binary file-like object. Its whole content is read into memory
            and handed to PyMuPDF as a PDF stream.

    Returns:
        str: The page texts joined in page order.
    """
    # The context manager closes the PyMuPDF document even when a page fails
    # to parse; the original left it open.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
14
 
15
- # 📄 Extract text from DOCX file
16
def extract_text_from_docx(file):
    """Return the text of every paragraph of a DOCX file, one paragraph per line.

    Args:
        file: Path or file-like object passed straight to ``docx.Document``.

    Returns:
        str: Paragraph texts joined with newlines (empty paragraphs included).
    """
    paragraphs = docx.Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
20
-
21
- # 🧠 Extract structured info from text
22
def extract_info(text):
    """Extract name, email, phone and known skills from resume text.

    Matching is deliberately conservative so that substrings of unrelated
    words are not reported:

    * ``name`` needs the standalone word "Name" ("Username" does not count).
    * ``email`` never ends in a dot, so sentence punctuation is excluded.
    * ``phone`` is exactly ten digits (optionally prefixed with ``+91``) that
      are not part of a longer digit run.
    * ``skills`` are matched as whole tokens, so "Java" is not found inside
      "JavaScript" nor "SQL" inside "MySQL".

    Args:
        text: Plain text of the resume.

    Returns:
        dict: ``{"name": str, "email": str, "phone": str, "skills": list[str]}``;
        fields that are not found are ``""`` (or ``[]`` for skills).
    """
    name_match = re.search(r"(?i)\bName\b\s*[:\-]?\s*(.+)", text)
    email_match = re.search(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+", text
    )
    phone_match = re.search(r"(?<!\d)(?:\+91[- ]?)?\d{10}(?!\d)", text)

    skill_keywords = [
        "Python", "Java", "C++", "NLP", "Machine Learning",
        "Data Science", "SQL", "React",
    ]
    # "+" and "#" count as token characters so that "C" and "C++" stay distinct.
    token_chars = "A-Za-z0-9+#"
    found_skills = [
        skill
        for skill in skill_keywords
        if re.search(
            rf"(?<![{token_chars}]){re.escape(skill)}(?![{token_chars}])",
            text,
            re.IGNORECASE,
        )
    ]

    return {
        "name": name_match.group(1).strip() if name_match else "",
        "email": email_match.group(0) if email_match else "",
        "phone": phone_match.group(0) if phone_match else "",
        "skills": found_skills,
    }
39
-
40
- # 🎯 Main function to process uploaded resume
41
def process_resume(file):
    """Parse an uploaded resume and return ``(data, status_message)``.

    Args:
        file: Uploaded file object exposing ``.name``, or ``None`` when the
            user clicked the button without uploading anything.

    Returns:
        tuple: ``(dict, str)``. The dict is the extracted resume data, or
        ``{"error": ...}`` when nothing could be processed; the string is the
        message shown in the status box.
    """
    if file is None:
        return {"error": "No file uploaded"}, "❌ Please upload a resume first"

    # Compare case-insensitively so "CV.PDF" is accepted like "cv.pdf".
    filename = file.name.lower()
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(file)
    elif filename.endswith(".docx"):
        text = extract_text_from_docx(file)
    else:
        return {"error": "Unsupported file format"}, "❌ File format not supported"

    return extract_info(text), "✅ Resume processed successfully!"
51
 
52
- # 🎨 Gradio UI Layout
53
# Gradio UI: upload + action buttons + status on the left, parsed JSON on the right.
with gr.Blocks(title="Smart Resume Parser", css="body { max-width: 1100px; margin: auto; }") as demo:
    gr.Markdown("## 📄 Smart Resume Parser – Extract structured info from PDF/DOCX")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Resume (PDF or DOCX)")
            with gr.Row():
                analyze_button = gr.Button("🔍 Parse Resume", variant="primary")
                reset_button = gr.Button("♻️ Reset", variant="stop")
            status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting for input...", interactive=False)

        with gr.Column(scale=2):
            output_json = gr.JSON(label="🧠 Extracted Resume Data")

    # Button actions
    analyze_button.click(
        fn=process_resume,
        inputs=[file_input],
        outputs=[output_json, status_box],
    )

    # Reset also clears the uploaded file; previously the old upload stayed
    # selected next to an empty result.
    reset_button.click(
        fn=lambda: (None, None, "⏳ Waiting for input..."),
        inputs=[],
        outputs=[file_input, output_json, status_box],
    )

# Mount the Gradio UI on a FastAPI app (served at "/").
app = gr.mount_gradio_app(app=FastAPI(), blocks=demo, path="/")

# Local development entry point.
if __name__ == "__main__":
    import uvicorn

    # Pass the app object itself: the "app:app" import string makes uvicorn
    # import this module a second time and rebuild the whole UI.
    uvicorn.run(app, host="0.0.0.0", port=7860)

# When imported under another module name, also register this module as
# "app" so that "app:app" lookups resolve to it.
import sys
if __name__ != "__main__":
    sys.modules["app"] = sys.modules[__name__]
 
1
  import gradio as gr
2
+ import pdfplumber
 
3
  import docx
4
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
5
+ from huggingface_hub import login
6
+ import pytesseract
7
+ import torch
8
+ import os
9
+ import spaces
10
 
11
+ # 🔐 Authenticate
12
# Log in to the Hugging Face Hub with the access token stored in the "token"
# environment variable. Without a token, login() falls back to an interactive
# prompt, which cannot be answered in a headless deployment, so it is skipped.
hf_token = os.environ.get("token")
if hf_token:
    login(token=hf_token)
else:
    print("⚠️ Environment variable 'token' is not set; skipping Hugging Face login.")

# ✅ GPU check: fail fast at start-up instead of at the first request.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

# 🧠 Model: Hub repository id of the model used for resume parsing.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
21
+
22
+ # 📄 Extractors
23
def extract_text_from_pdf(file):
    """Extract text from a PDF, using OCR for pages without extractable text.

    Args:
        file: Path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        str: The text of every page in order, each followed by a newline.
    """
    page_texts = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if not extracted:
                # Nothing extractable on this page: render it at 300 DPI and
                # run Tesseract OCR on the image instead.
                rendered = page.to_image(resolution=300).original
                extracted = pytesseract.image_to_string(rendered)
            page_texts.append(extracted + "\n")
    return "".join(page_texts)
34
 
 
35
def extract_text_from_docx(file):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Args:
        file: Path or file-like object passed straight to ``docx.Document``.

    Returns:
        str: Texts of all paragraphs that contain non-whitespace characters,
        joined with newlines.
    """
    kept = []
    for paragraph in docx.Document(file).paragraphs:
        # Whitespace-only paragraphs are dropped.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
38
+
39
def chunk_text(text, max_chars=6000):
    """Split *text* into newline-terminated chunks of at most *max_chars* characters.

    The text is split on newlines and paragraphs are packed greedily into the
    current chunk. A paragraph too long to fit in a chunk on its own is cut
    into pieces first, so no chunk exceeds the limit and no empty chunk is
    emitted. For input whose paragraphs are all shorter than *max_chars* the
    result is the same as before this fix.

    Args:
        text: The text to split.
        max_chars: Maximum chunk length, trailing newline included. Must be
            at least 2 (one character plus its newline).

    Returns:
        list[str]: The chunks in order; every paragraph (or paragraph piece)
        inside a chunk is followed by ``"\\n"``.

    Raises:
        ValueError: If *max_chars* is smaller than 2.
    """
    if max_chars < 2:
        raise ValueError("max_chars must be at least 2")

    piece_len = max_chars - 1  # leave room for the trailing newline
    chunks = []
    current = ""
    for para in text.split("\n"):
        # An empty paragraph still contributes its newline, hence `or [""]`.
        pieces = [
            para[start:start + piece_len]
            for start in range(0, len(para), piece_len)
        ] or [""]
        for piece in pieces:
            if len(current) + len(piece) < max_chars:
                current += piece + "\n"
            else:
                # A piece always fits into an empty chunk, so `current` is
                # never empty when this branch runs.
                chunks.append(current)
                current = piece + "\n"
    if current:
        chunks.append(current)
    return chunks
51
+
52
+ # 🧾 Resume Prompt
53
# Fixed instruction text and example JSON that precede the resume content.
_RESUME_PROMPT_HEADER = """
You are an AI assistant trained to parse resumes. Extract the following information in JSON format based on the content below.

Return only valid JSON like this example:
{
"name": "John Doe",
"email": "john@example.com",
"phone": "+1-1234567890",
"skills": ["Python", "Java", "Machine Learning"],
"education": "B.Tech in Computer Science from MIT",
"experience": "3 years as Software Engineer at Google"
}

CONTENT:
"""


def create_resume_prompt(text_chunk):
    """Build the extraction prompt for one chunk of resume text.

    Args:
        text_chunk: The resume text to place under the ``CONTENT:`` heading.

    Returns:
        str: The instructions and example JSON followed by *text_chunk* and a
        trailing newline.
    """
    return _RESUME_PROMPT_HEADER + f"{text_chunk}\n"
70
+
71
+ # 🧼 Clean JSON output
72
def clean_to_json(generated):
    """Return the outermost ``{...}`` span found in model output.

    Args:
        generated: Text produced by the model, possibly with prose around the
            JSON object.

    Returns:
        str: The substring from the first ``{`` to the last ``}`` that follows
        it, or a JSON object with a single ``"error"`` key when no such span
        exists (or *generated* is not a string).
    """
    try:
        start = generated.index("{")
        # Look for the closing brace only after the opening one; a stray "}"
        # earlier in the text used to produce an empty string.
        end = generated.rindex("}", start) + 1
    except (ValueError, AttributeError):
        return '{"error": "❌ Failed to extract JSON from model output"}'
    return generated[start:end]
79
+
80
+ # 🚀 Main Resume Analyzer
81
def _merge_resume_fields(merged, fields):
    """Fold one chunk's parsed fields into *merged* in place, skipping empty values."""
    for key, value in fields.items():
        if value in (None, "", [], {}):
            # An empty field from a later chunk must not erase earlier data.
            continue
        existing = merged.get(key)
        if isinstance(existing, list) and isinstance(value, list):
            # Accumulate list fields (e.g. skills) without duplicates.
            existing.extend(item for item in value if item not in existing)
        else:
            merged[key] = value


@spaces.GPU(duration=60)
def analyze_resume(file, cancel_flag):
    """Parse an uploaded resume with the language model.

    The resume text is extracted, split into chunks, and each chunk is sent
    to the model; the JSON objects returned for the chunks are merged.

    Args:
        file: The upload from the Gradio file component: a filesystem path or
            an object exposing ``.name``. ``None`` when nothing was uploaded.
        cancel_flag: When truthy, processing stops before the next chunk.

    Returns:
        tuple: ``(dict, str)``. The dict holds the extracted fields, or
        ``{"error": ...}`` on failure; the string is the status message.
    """
    import json  # function-scope import: the module import block has no json

    if file is None:
        return {"error": "❌ Invalid format"}, "❌ Please upload a valid PDF or DOCX file."

    # NOTE(review): depending on the Gradio version/config the upload arrives
    # as a path string or as a temp-file wrapper with .name; confirm which.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    ext = os.path.splitext(path)[-1].lower()

    if ext == ".pdf":
        raw_text = extract_text_from_pdf(path)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(path)
    else:
        return {"error": "❌ Invalid format"}, "❌ Please upload a valid PDF or DOCX file."

    if not raw_text.strip():
        return {"error": "❌ No text found"}, "❌ Empty resume"

    chunks = chunk_text(raw_text)
    full_json = {}

    hf_token = os.environ.get("token")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    # NOTE(review): trust_remote_code=True allows code from the model repo to
    # run locally; confirm it is really needed for this model.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        token=hf_token,
        trust_remote_code=True,
    )
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    for chunk in chunks:
        if cancel_flag:
            return {"error": "⛔ Terminated by user"}, "⛔ Analysis cancelled"
        prompt = create_resume_prompt(chunk)
        # return_full_text=False: otherwise the output starts with the prompt,
        # whose example JSON would be picked up by the brace search.
        result = generator(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            return_full_text=False,
        )[0]["generated_text"]
        try:
            # json.loads instead of eval: model output is untrusted text, and
            # JSON literals (true/false/null) are not valid Python.
            chunk_json = json.loads(clean_to_json(result))
        except json.JSONDecodeError:
            continue
        # Skip non-objects and the error-only object clean_to_json returns on failure.
        if not isinstance(chunk_json, dict) or set(chunk_json) == {"error"}:
            continue
        _merge_resume_fields(full_json, chunk_json)

    if not full_json:
        return (
            {"error": "❌ Failed to extract JSON from model output"},
            "❌ Could not parse the model output",
        )

    return full_json, "✅ Resume parsed successfully!"
121
+
122
+ # 🌐 Gradio UI
123
# Gradio UI: upload + action buttons + status on the left, parsed JSON on the right.
with gr.Blocks(title="Smart Resume Parser - AI Edition") as demo:
    gr.Markdown("## 📄 Resume Parser – Extract structured info using Mistral 7B (GPU Accelerated)")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_btn = gr.Button("🔍 Parse Resume", variant="primary")
                stop_btn = gr.Button("❌ Cancel", variant="stop")
            status = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)

        with gr.Column(scale=2):
            json_output = gr.JSON(label="🧠 Extracted Resume Data")

    # Kept because analyze_resume takes the flag as its second argument. It
    # stays False: the value is read when the click fires, so setting it later
    # could not stop a running job and would abort every following parse.
    cancel_flag = gr.State(False)

    analyze_event = analyze_btn.click(
        fn=analyze_resume,
        inputs=[file_input, cancel_flag],
        outputs=[json_output, status],
    )

    # Cancel through Gradio's own mechanism instead of a state flag.
    # NOTE(review): per the Gradio docs `cancels` stops pending calls and
    # iterating generators; a call that is already executing may still run to
    # completion. Confirm the behaviour on the deployed Gradio version.
    stop_btn.click(
        fn=lambda: "⛔ Analysis cancelled",
        inputs=[],
        outputs=[status],
        cancels=[analyze_event],
    )

# `cancels` relies on the request queue, so enable it explicitly.
# NOTE(review): share=True requests a public share link; confirm it is wanted.
demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)