sejalkishan committed on
Commit
260a5ff
·
verified ·
1 Parent(s): 2d465b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -63
app.py CHANGED
@@ -6,22 +6,31 @@ from huggingface_hub import login
6
  import pytesseract
7
  import torch
8
  import os
9
- import spaces
10
  import re
 
11
 
 
12
  login(token=os.environ.get("token"))
13
 
 
14
  if not torch.cuda.is_available():
15
- raise RuntimeError("❌ GPU not detected!")
16
  print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")
17
 
 
18
  model_id = "mistralai/Mistral-7B-Instruct-v0.2"
19
 
20
def extract_text_from_pdf(file):
    """Concatenate the text of every PDF page, OCRing pages with no text layer."""
    text = ""
    with pdfplumber.open(file) as pdf:
        for pdf_page in pdf.pages:
            extracted = pdf_page.extract_text()
            if extracted:
                text += extracted
            else:
                # Falsy extraction result (image-only page) -> OCR the rendered page.
                text += pytesseract.image_to_string(pdf_page.to_image().original)
    return text
26
 
27
  def extract_text_from_docx(file):
@@ -29,111 +38,106 @@ def extract_text_from_docx(file):
29
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
30
 
31
def chunk_text(text, max_chars=6000):
    """Greedily pack newline-split lines into chunks of fewer than max_chars characters."""
    chunks = []
    current = ""
    for line in text.split("\n"):
        candidate_len = len(current) + len(line)
        if candidate_len < max_chars:
            current = current + line + "\n"
        else:
            # Current chunk is full; start a new one with this line.
            chunks.append(current)
            current = line + "\n"
    if current:
        chunks.append(current)
    return chunks
42
 
43
def create_prompt(text):
    """Wrap one resume chunk in the fixed field-extraction instruction prompt."""
    prompt = f"""
Analyze the following resume and extract these key details clearly:

- Name
- Email
- Phone
- Skills
- Education
- Experience

Format output like this:

Name: ...
Email: ...
Phone: ...
Skills:
- ...
- ...
Education: ...
Experience:
- ...
- ...

CONTENT:
{text}
"""
    return prompt
70
 
71
def clean_model_output(output):
    """Trim echoed prompt text, keeping everything from the first "Name:" label on."""
    idx = output.find("Name:")
    # No label found -> fall back to the whole (stripped) output.
    return (output[idx:] if idx != -1 else output).strip()
76
 
77
@spaces.GPU(duration=60)
def analyze_resume(file, cancel_flag):
    """Run the resume-extraction prompt over an uploaded PDF/DOCX.

    Returns a (summary, status) pair matching the two Gradio output widgets.
    """
    extension = os.path.splitext(file.name)[-1].lower()

    # Dispatch on extension; anything but PDF/DOCX is rejected up front.
    if extension == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif extension == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format", "❌ Try PDF or DOCX"

    if not raw_text.strip():
        return "❌ No text found in the document", "❌ Empty file"

    chunks = chunk_text(raw_text)

    # Model weights are loaded on every invocation (short-lived GPU worker).
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.float16,
        token=os.environ.get("token"), trust_remote_code=True
    )
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    summaries = []
    for chunk_no, chunk in enumerate(chunks):
        # NOTE(review): cancel_flag is the State value captured at click time;
        # presumably a later Cancel click cannot reach a run already in
        # progress — verify against Gradio State semantics.
        if cancel_flag:
            return "β›” Analysis cancelled by user.", "β›” Cancelled"
        prompt = create_prompt(chunk)
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        print(f"\nπŸ”Ή Chunk {chunk_no+1} Output:\n{result}\n")
        summaries.append(clean_model_output(result))
    final_summary = "".join(s + "\n\n---\n\n" for s in summaries)

    return final_summary.strip(), "βœ… Resume analysis complete"
109
 
110
# 🌐 Gradio UI: file upload + controls on the left, extracted text on the right.
with gr.Blocks(title="Resume Parser - Key Insight Extractor") as demo:
    gr.Markdown("## πŸ“„ Resume Analyzer – Extract key information (Name, Email, Skills, etc)")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="πŸ“Ž Upload Resume (PDF or DOCX)")
            with gr.Row():
                analyze_btn = gr.Button("πŸ” Parse Resume", variant="primary")
                stop_btn = gr.Button("❌ Cancel", variant="stop")
            status_box = gr.Textbox(label="πŸ“Š Status", value="⏳ Waiting...", interactive=False)

        with gr.Column(scale=2):
            output_text = gr.Textbox(label="🧠 Resume Key Points", lines=30, interactive=False)

    # Boolean State flipped by the Cancel button and read by analyze_resume.
    cancel_flag = gr.State(False)

    analyze_btn.click(
        fn=analyze_resume,
        inputs=[file_input, cancel_flag],
        outputs=[output_text, status_box]
    )

    stop_btn.click(
        fn=lambda: gr.update(value=True),
        inputs=[],
        outputs=[cancel_flag]
    )

demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
import pytesseract
import torch
import os
import re
import spaces

# Authenticate against the Hugging Face Hub with the Space's secret token.
login(token=os.environ.get("token"))

# Fail fast when no CUDA device is present: the model below needs a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("❌ GPU not detected! Please enable GPU in Space settings.")
print(f"βœ… Using GPU: {torch.cuda.get_device_name(0)}")

# Checkpoint used for all generations in this app.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
22
 
23
def extract_text_from_pdf(file):
    """Pull text from every PDF page, falling back to OCR for image-only pages."""
    pages_text = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                pages_text.append(extracted)
            else:
                # No text layer: rasterize at 300 dpi and OCR the image.
                image = page.to_image(resolution=300).original
                pages_text.append(pytesseract.image_to_string(image))
    # Each page's text is newline-terminated, matching the per-page "+ \n" scheme.
    return "".join(p + "\n" for p in pages_text)
35
 
36
  def extract_text_from_docx(file):
 
38
  return "\n".join([para.text for para in doc.paragraphs if para.text.strip() != ""])
39
 
40
def chunk_text(text, max_chars=6000):
    """Split text on newlines and greedily pack paragraphs into chunks.

    Each chunk stays under max_chars characters (paragraph granularity; a
    single paragraph longer than max_chars still becomes one oversized chunk).

    Bug fix: previously, when the very first paragraph alone exceeded
    max_chars, an empty-string chunk was appended and later sent to the
    model as a wasted generation. Empty chunks are now skipped.
    """
    paragraphs = text.split("\n")
    chunks, current_chunk = [], ""
    for para in paragraphs:
        if len(current_chunk) + len(para) < max_chars:
            current_chunk += para + "\n"
        else:
            if current_chunk:  # never emit an empty chunk
                chunks.append(current_chunk)
            current_chunk = para + "\n"
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
52
 
53
def create_prompt(text_chunk):
    """Wrap one resume chunk in the fixed field-extraction instruction prompt."""
    prompt = f"""
Analyze the following resume and extract ONLY the following fields in clean text format with clear labels:

Name
Email
Phone
Skills (bullet points)
Education (bullet points)
Experience (bullet points with org, role, period)
Projects (bullet points with brief descriptions)

Return only these details and nothing else.

CONTENT:
{text_chunk}
"""
    return prompt
70
 
71
def extract_final_response(raw_output):
    """Strip the echoed prompt from a generation, keeping the answer section.

    text-generation pipelines return prompt + completion, and the prompt
    itself mentions "Name", so the model's answer starts at the second
    "Name:" occurrence. Falls back to the whole stripped output when fewer
    than two matches are found.
    """
    # Bug fix: the pattern was written with doubled backslashes
    # (r"\\bName\\s*:"), which matches a literal backslash before "Name"
    # and therefore never fires on real model output. r"\bName\s*:" is
    # the intended word-boundary regex.
    matches = list(re.finditer(r"\bName\s*:", raw_output))
    if len(matches) >= 2:
        return raw_output[matches[1].start():].strip()
    return raw_output.strip()
76
 
77
@spaces.GPU(duration=60)
def analyze_document(file, cancel_flag):
    """Extract key resume fields from an uploaded PDF/DOCX with Mistral-7B.

    Returns a (summary, status) pair matching the two Gradio output widgets.
    """
    extension = os.path.splitext(file.name)[-1].lower()

    # Route to the matching text extractor; reject other formats up front.
    if extension == ".pdf":
        raw_text = extract_text_from_pdf(file)
    elif extension == ".docx":
        raw_text = extract_text_from_docx(file)
    else:
        return "❌ Unsupported file format. Please upload a PDF or DOCX.", "❌ Invalid format"

    if len(raw_text.strip()) == 0:
        return "❌ No text found in the document.", "❌ Empty document"

    chunks = chunk_text(raw_text)

    # Model/tokenizer are (re)loaded on every call; acceptable for a
    # short-lived ZeroGPU worker, but worth caching if latency matters.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ.get("token"))
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        token=os.environ.get("token"),
        trust_remote_code=True
    )
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    sections = []
    for chunk in chunks:
        # NOTE(review): cancel_flag is the State value captured at click time;
        # presumably a later Terminate click cannot reach a run already in
        # progress — verify against Gradio State semantics.
        if cancel_flag:
            return "β›” Analysis cancelled by user.", "β›” Terminated by user"

        prompt = create_prompt(chunk)
        result = generator(prompt, max_new_tokens=1024, do_sample=False)[0]["generated_text"]
        sections.append(extract_final_response(result))

    full_summary = "".join(s + "\n\n---\n\n" for s in sections)
    return full_summary.strip(), "βœ… Completed"
114
 
115
# Gradio front end: upload + controls on the left, extracted fields on the right.
with gr.Blocks(title="Smart Resume Parser - AI Powered") as demo:
    gr.Markdown("## πŸ“„ Resume Parser – Extract Key Info using Mistral-7B")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="πŸ“Ž Upload Resume (PDF/DOCX)")
            with gr.Row():
                analyze_button = gr.Button("πŸ” Analyze", variant="primary")
                terminate_button = gr.Button("❌ Terminate", variant="stop")
            status_box = gr.Textbox(label="πŸ“Š Status", value="⏳ Waiting for input...", interactive=False)

        with gr.Column(scale=2):
            output_box = gr.Textbox(label="🧠 Extracted Resume Info", lines=30, interactive=False)

    # Boolean State flipped by the Terminate button and read by analyze_document.
    cancel_flag = gr.State(False)

    analyze_button.click(
        fn=analyze_document,
        inputs=[file_input, cancel_flag],
        outputs=[output_box, status_box]
    )

    terminate_button.click(
        fn=lambda: gr.update(value=True),
        inputs=[],
        outputs=[cancel_flag]
    )

demo.launch(server_name="0.0.0.0", server_port=7860)