sejalkishan committed on
Commit 2d465b2 · verified · 1 Parent(s): 057c243

Update app.py

Files changed (1)
  1. app.py +50 -86
app.py CHANGED
@@ -7,163 +7,127 @@ import pytesseract
  import torch
  import os
  import spaces
- import json
  import re

- # 🔐 Authenticate Hugging Face Token
  login(token=os.environ.get("token"))

- # ✅ Ensure GPU is available
  if not torch.cuda.is_available():
-     raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
+     raise RuntimeError("❌ GPU not detected!")
  print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")

- # 🧠 Model ID
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"

- # 📄 Extract text from PDF or DOCX
  def extract_text_from_pdf(file):
      text = ""
      with pdfplumber.open(file) as pdf:
          for page in pdf.pages:
-             page_text = page.extract_text()
-             if page_text:
-                 text += page_text + "\n"
-             else:
-                 img = page.to_image(resolution=300).original
-                 text += pytesseract.image_to_string(img) + "\n"
+             text += page.extract_text() or pytesseract.image_to_string(page.to_image().original)
      return text

  def extract_text_from_docx(file):
      doc = docx.Document(file)
      return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])

- # 📦 Chunking long text
  def chunk_text(text, max_chars=6000):
-     paras = text.split("\n")
      chunks, current = [], ""
-     for para in paras:
-         if len(current) + len(para) < max_chars:
-             current += para + "\n"
+     for line in text.split("\n"):
+         if len(current) + len(line) < max_chars:
+             current += line + "\n"
          else:
              chunks.append(current)
-             current = para + "\n"
+             current = line + "\n"
      if current:
          chunks.append(current)
      return chunks

- # 🧾 Prompt Template (strict JSON-only)
- def create_resume_prompt(text_chunk):
+ def create_prompt(text):
      return f"""
- You are a resume parsing engine.
-
- Extract only the following fields from the content below and return them as a valid JSON object.
- Do not include any explanation or formatting — only the JSON.
-
- {{
- "name": "",
- "email": "",
- "phone": "",
- "skills": [],
- "education": "",
- "experience": []
- }}
+ Analyze the following resume and extract these key details clearly:
+
+ - Name
+ - Email
+ - Phone
+ - Skills
+ - Education
+ - Experience
+
+ Format output like this:
+
+ Name: ...
+ Email: ...
+ Phone: ...
+ Skills:
+ - ...
+ - ...
+ Education: ...
+ Experience:
+ - ...
+ - ...

  CONTENT:
- {text_chunk}
+ {text}
  """

- # 🧼 Regex JSON extractor
- def clean_to_json(generated):
-     try:
-         match = re.search(r"{[\s\S]+?}", generated)
-         if match:
-             raw_json = match.group()
-             print("🧾 Cleaned JSON block:\n", raw_json)
-             return json.loads(raw_json)
-         else:
-             return {"error": "❌ No JSON object found in model output"}
-     except Exception as e:
-         return {"error": f"❌ JSON parsing failed: {str(e)}"}
+ def clean_model_output(output):
+     start_index = output.find("Name:")
+     if start_index != -1:
+         return output[start_index:].strip()
+     return output.strip()

- # 🚀 Main Resume Analysis
  @spaces.GPU(duration=60)
  def analyze_resume(file, cancel_flag):
      ext = os.path.splitext(file.name)[-1].lower()
-
      if ext == ".pdf":
          raw_text = extract_text_from_pdf(file)
      elif ext == ".docx":
          raw_text = extract_text_from_docx(file)
      else:
-         return {"error": "❌ Invalid file format"}, "❌ Upload a valid PDF or DOCX"
+         return "❌ Unsupported file format", "❌ Try PDF or DOCX"

      if not raw_text.strip():
-         return {"error": "❌ No text found in resume"}, "❌ Empty file"
+         return "❌ No text found in the document", "❌ Empty file"

      chunks = chunk_text(raw_text)

      tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
      model = AutoModelForCausalLM.from_pretrained(
-         model_id,
-         device_map="auto",
-         torch_dtype=torch.float16,
-         token=os.environ.get("token"),
-         trust_remote_code=True
+         model_id, device_map="auto", torch_dtype=torch.float16,
+         token=os.environ.get("token"), trust_remote_code=True
      )
      generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

-     final_output = {
-         "name": "",
-         "email": "",
-         "phone": "",
-         "skills": [],
-         "education": "",
-         "experience": []
-     }
-
+     final_summary = ""
      for i, chunk in enumerate(chunks):
          if cancel_flag:
-             return {"error": "⛔ Cancelled"}, "⛔ User cancelled"
-
-         prompt = create_resume_prompt(chunk)
+             return "⛔ Analysis cancelled by user.", "⛔ Cancelled"
+         prompt = create_prompt(chunk)
          result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
+         print(f"\n🔹 Chunk {i+1} Output:\n{result}\n")
+         final_summary += clean_model_output(result) + "\n\n---\n\n"

-         print(f"\n\n🔍 Chunk {i+1} Output:\n{result}\n\n")
-
-         parsed = clean_to_json(result)
-
-         if isinstance(parsed, dict):
-             for key in final_output.keys():
-                 if isinstance(final_output[key], list):
-                     final_output[key].extend(parsed.get(key, []))
-                     final_output[key] = list(set(final_output[key]))  # Remove duplicates
-                 elif not final_output[key] and parsed.get(key):
-                     final_output[key] = parsed.get(key)
-
-     return final_output, "✅ Resume parsed successfully!"
+     return final_summary.strip(), "✅ Resume analysis complete"

  # 🌐 Gradio UI
- with gr.Blocks(title="Smart Resume Parser - AI Edition") as demo:
-     gr.Markdown("## 📄 Resume Parser – Extract structured info using Mistral 7B (GPU Accelerated)")
+ with gr.Blocks(title="Resume Parser - Key Insight Extractor") as demo:
+     gr.Markdown("## 📄 Resume Analyzer – Extract key information (Name, Email, Skills, etc)")

      with gr.Row():
          with gr.Column(scale=1):
-             file_input = gr.File(label="📎 Upload Resume (PDF/DOCX)")
+             file_input = gr.File(label="📎 Upload Resume (PDF or DOCX)")
              with gr.Row():
                  analyze_btn = gr.Button("🔍 Parse Resume", variant="primary")
                  stop_btn = gr.Button("❌ Cancel", variant="stop")
-             status = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)
+             status_box = gr.Textbox(label="📊 Status", value="⏳ Waiting...", interactive=False)

          with gr.Column(scale=2):
-             json_output = gr.JSON(label="🧠 Extracted Resume Data")
+             output_text = gr.Textbox(label="🧠 Resume Key Points", lines=30, interactive=False)

      cancel_flag = gr.State(False)

      analyze_btn.click(
          fn=analyze_resume,
          inputs=[file_input, cancel_flag],
-         outputs=[json_output, status]
+         outputs=[output_text, status_box]
      )

      stop_btn.click(
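
The hunk above is cut off at the final stop_btn.click( call, so the Cancel wiring itself is not visible. For reference, a minimal sketch of how the Cancel and Parse buttons could set and clear the cancel_flag state that analyze_resume receives; the helper names set_cancel and reset_cancel are illustrative assumptions, not code from this commit.

# Illustrative sketch only, not part of this commit: each handler's return value
# is stored back into the shared gr.State component (cancel_flag).
def set_cancel():
    return True    # Cancel pressed: mark the flag

def reset_cancel():
    return False   # clear the flag before a new run

stop_btn.click(fn=set_cancel, inputs=None, outputs=cancel_flag)
analyze_btn.click(fn=reset_cancel, inputs=None, outputs=cancel_flag)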