Irfaniiioo committed
Commit c8c828c · verified · 1 Parent(s): 79eb202

Update app.py

Files changed (1):
  app.py (+63 -108)
app.py CHANGED
@@ -3,23 +3,38 @@ import re
 
 import torch
 import gradio as gr
-import spaces
-
 from huggingface_hub import snapshot_download
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
-
+import spaces  # provided automatically on HF Spaces
 
 # -----------------------------
-# 1. Download and patch adapter
+# 1. Constants
 # -----------------------------
 PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
 BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"
 
+SYSTEM_PROMPT = (
+    "You analyze how well a CV matches a job description. "
+    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
+    "matching_analysis, description, score, recommendation.\n\n"
+    "Constraints:\n"
+    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
+    "- description: at most 2 sentences, max 35 words total.\n"
+    "- score: integer from 0 to 100.\n"
+    "- recommendation: at most 2 sentences, max 35 words total.\n\n"
+    "Very important:\n"
+    "- Do NOT include the full CV or job description text.\n"
+    "- Do NOT wrap the JSON in backticks or any extra text.\n"
+    "- Output ONLY raw JSON, nothing before or after."
+)
+
+# -----------------------------
+# 2. Download & patch adapter (CPU only, safe in main process)
+# -----------------------------
 print("Downloading adapter...")
 adapter_path = snapshot_download(PEFT_MODEL_ID)
 
-# Patch adapter_config.json so PEFT knows it's a causal LM
 config_path = adapter_path + "/adapter_config.json"
 with open(config_path, "r") as f:
     cfg = json.load(f)
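Note: the lines that actually apply the patch (old lines 26-31) fall between these two hunks and are not shown. Judging from the `with open(config_path, "w")` context line and the `task_type = CAUSAL_LM` log message below, the elided code presumably looks something like this sketch (the exact assignment is an assumption, not taken from the commit):

```python
# Hypothetical reconstruction of the elided patch step: force the task
# type so PEFT loads the adapter as a causal-LM adapter.
cfg["task_type"] = "CAUSAL_LM"  # value confirmed only by the log line below
with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
```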
@@ -32,78 +47,11 @@ with open(config_path, "w") as f:
 print("Patched adapter_config.json → task_type = CAUSAL_LM")
 print("Adapter path:", adapter_path)
 
-
-# -----------------------------
-# 2. Load base model + tokenizer (GPU if available)
-# -----------------------------
-print("Loading tokenizer and base model...")
-
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
-
-# ensure we have a pad token
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "left"
-
-use_gpu = torch.cuda.is_available()
-print("CUDA available:", use_gpu)
-
-if use_gpu:
-    # 4-bit quantization on GPU
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-    )
-
-    base_model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL_NAME,
-        quantization_config=bnb_config,
-        device_map="cuda",  # fully on GPU
-    )
-else:
-    # Fallback to CPU (slower)
-    base_model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL_NAME,
-        device_map="cpu",
-    )
-
-base_model.config.pad_token_id = tokenizer.pad_token_id
-
-
 # -----------------------------
-# 3. Load LoRA adapter
+# 3. Globals for lazy GPU init
 # -----------------------------
-print("Loading LoRA adapter...")
-model = PeftModel.from_pretrained(
-    base_model,
-    adapter_path,
-    device_map="cuda" if use_gpu else "cpu",
-)
-
-model.eval()
-torch.set_grad_enabled(False)
-print("Model + LoRA adapter loaded successfully.")
-model_device = next(model.parameters()).device
-print("Model device:", model_device)
-
-
-# -----------------------------
-# 4. System prompt + message builder
-# -----------------------------
-SYSTEM_PROMPT = (
-    "You analyze how well a CV matches a job description. "
-    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
-    "matching_analysis, description, score, recommendation.\n\n"
-    "Constraints:\n"
-    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
-    "- description: at most 2 sentences, max 35 words total.\n"
-    "- score: integer from 0 to 100.\n"
-    "- recommendation: at most 2 sentences, max 35 words total.\n\n"
-    "Very important:\n"
-    "- Do NOT include the full CV or job description text.\n"
-    "- Do NOT wrap the JSON in backticks or any extra text.\n"
-    "- Output ONLY raw JSON, nothing before or after."
-)
+tokenizer = None
+model = None
 
 
 def build_messages(cv: str, job_description: str):
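This hunk is the heart of the commit: eager model loading at import time is replaced by two `None` globals. On Stateless GPU (ZeroGPU) Spaces the main process must stay CUDA-free, so the heavy loading moves into the `@spaces.GPU`-decorated function below and is cached across calls. A minimal sketch of that lazy-singleton pattern, with `load_heavy_model()` as a hypothetical stand-in for the quantized-base-plus-LoRA loading:

```python
_model = None

def get_model():
    global _model
    if _model is None:               # first call pays the load cost
        _model = load_heavy_model()  # hypothetical loader, runs once per process
    return _model                    # later calls reuse the cached object
```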
@@ -119,22 +67,17 @@ def build_messages(cv: str, job_description: str):
     ]
 
 
-# -----------------------------
-# 5. Helper: extract JSON safely
-# -----------------------------
 def extract_json_from_text(text: str):
     """
     Try to pull a JSON object out of the model's output.
     If it fails, wrap the raw text in a fallback JSON structure.
     """
-    # First try: find a {...} block
     match = re.search(r"\{.*\}", text, flags=re.DOTALL)
     candidate = match.group(0) if match else text
 
     try:
         return json.loads(candidate)
     except Exception:
-        # Fallback – always return valid JSON
         return {
             "matching_analysis": [
                 "Model output could not be parsed as JSON.",
@@ -146,9 +89,12 @@
 
 
 # -----------------------------
-# 6. Main inference function
+# 4. Main inference function (GPU)
 # -----------------------------
+@spaces.GPU  # required for Stateless GPU Spaces
 def match_cv_job(cv: str, job_description: str):
+    global tokenizer, model
+
     if not cv.strip() or not job_description.strip():
         return {
             "matching_analysis": ["Please provide both a CV and a job description."],
@@ -157,23 +103,50 @@
             "recommendation": "Fill both text boxes and run again.",
         }
 
+    # Lazy GPU initialization: all CUDA-related stuff happens ONLY here
+    if tokenizer is None or model is None:
+        print("Initializing tokenizer + model on GPU...")
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_NAME,
+            quantization_config=bnb_config,
+            device_map="auto",
+        )
+
+        base_model.config.pad_token_id = tokenizer.pad_token_id
+
+        model_ = PeftModel.from_pretrained(
+            base_model,
+            adapter_path,
+            device_map="auto",
+        )
+        model_.eval()
+        torch.set_grad_enabled(False)
+
+        model = model_
+        print("Model + LoRA adapter loaded successfully on GPU.")
+
     messages = build_messages(cv, job_description)
 
-    # Build chat prompt as plain text
     prompt = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
 
-    # Tokenize and move to model device
-    encoded = tokenizer(
-        prompt,
-        return_tensors="pt",
-    )
-    encoded = {k: v.to(model_device) for k, v in encoded.items()}
+    encoded = tokenizer(prompt, return_tensors="pt")
+    # Move tensors to the same device as the model
+    encoded = {k: v.to(model.device) for k, v in encoded.items()}
 
-    # Generate
     with torch.inference_mode():
        outputs = model.generate(
            **encoded,
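Keeping `load_in_4bit=True` (now with `device_map="auto"`) is what lets an 8B base model fit on a single Space GPU. Rough arithmetic, ignoring quantization block overhead, activations, and the LoRA weights:

```python
# Back-of-envelope memory for the 8B base model at 4 bits per weight.
params = 8_000_000_000
bytes_per_param = 0.5                  # 4 bits
gib = params * bytes_per_param / 2**30
print(f"{gib:.1f} GiB")                # ~3.7 GiB, vs ~14.9 GiB at fp16
```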
@@ -181,7 +154,6 @@
             pad_token_id=tokenizer.pad_token_id,
         )
 
-    # Remove the prompt tokens
     input_len = encoded["input_ids"].shape[1]
     generated_tokens = outputs[0][input_len:]
     generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
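`generate()` returns the prompt ids followed by the newly generated ids, so slicing at `input_len` before decoding keeps only the completion. A toy illustration with made-up token ids:

```python
prompt_ids = [101, 7592, 2088]           # 3 prompt tokens
full_output = prompt_ids + [3000, 3001]  # generate() echoes the prompt first
input_len = len(prompt_ids)
assert full_output[input_len:] == [3000, 3001]  # only new tokens are decoded
```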
@@ -191,24 +163,7 @@
 
 
 # -----------------------------
-# 7. GPU warmup for Spaces
-# -----------------------------
-@spaces.GPU
-def warmup():
-    """
-    This function is automatically detected by Hugging Face Spaces
-    when using 'GPU on demand'. It runs one tiny inference to make
-    sure the model is loaded on GPU.
-    """
-    print("Running GPU warmup...")
-    dummy_cv = "Experienced software engineer with 5 years in backend development."
-    dummy_jd = "We are looking for a backend software engineer with Python experience."
-    _ = match_cv_job(dummy_cv, dummy_jd)
-    print("Warmup finished.")
-
-
-# -----------------------------
-# 8. Gradio interface
+# 5. Gradio interface
 # -----------------------------
 cv_input = gr.Textbox(
     label="CV",
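The diff ends inside the interface definition, so the wiring below is a hedged guess at how the pieces fit together; `jd_input` and `demo` are hypothetical names, not taken from the commit:

```python
jd_input = gr.Textbox(label="Job Description", lines=10)  # assumed counterpart

demo = gr.Interface(
    fn=match_cv_job,
    inputs=[cv_input, jd_input],
    outputs=gr.JSON(label="Match result"),  # match_cv_job returns a dict
)
demo.launch()
```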
 
3
 
4
  import torch
5
  import gradio as gr
 
 
6
  from huggingface_hub import snapshot_download
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
8
  from peft import PeftModel
9
+ import spaces # provided automatically on HF Spaces
10
 
11
  # -----------------------------
12
+ # 1. Constants
13
  # -----------------------------
14
  PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
15
  BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"
16
 
17
+ SYSTEM_PROMPT = (
18
+ "You analyze how well a CV matches a job description. "
19
+ "Your ONLY output must be a single JSON object with EXACTLY these keys: "
20
+ "matching_analysis, description, score, recommendation.\n\n"
21
+ "Constraints:\n"
22
+ "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
23
+ "- description: at most 2 sentences, max 35 words total.\n"
24
+ "- score: integer from 0 to 100.\n"
25
+ "- recommendation: at most 2 sentences, max 35 words total.\n\n"
26
+ "Very important:\n"
27
+ "- Do NOT include the full CV or job description text.\n"
28
+ "- Do NOT wrap the JSON in backticks or any extra text.\n"
29
+ "- Output ONLY raw JSON, nothing before or after."
30
+ )
31
+
32
+ # -----------------------------
33
+ # 2. Download & patch adapter (CPU only, safe in main process)
34
+ # -----------------------------
35
  print("Downloading adapter...")
36
  adapter_path = snapshot_download(PEFT_MODEL_ID)
37
 
 
38
  config_path = adapter_path + "/adapter_config.json"
39
  with open(config_path, "r") as f:
40
  cfg = json.load(f)
 
47
  print("Patched adapter_config.json → task_type = CAUSAL_LM")
48
  print("Adapter path:", adapter_path)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # -----------------------------
51
+ # 3. Globals for lazy GPU init
52
  # -----------------------------
53
+ tokenizer = None
54
+ model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  def build_messages(cv: str, job_description: str):
 
67
  ]
68
 
69
 
 
 
 
70
  def extract_json_from_text(text: str):
71
  """
72
  Try to pull a JSON object out of the model's output.
73
  If it fails, wrap the raw text in a fallback JSON structure.
74
  """
 
75
  match = re.search(r"\{.*\}", text, flags=re.DOTALL)
76
  candidate = match.group(0) if match else text
77
 
78
  try:
79
  return json.loads(candidate)
80
  except Exception:
 
81
  return {
82
  "matching_analysis": [
83
  "Model output could not be parsed as JSON.",
 
89
 
90
 
91
  # -----------------------------
92
+ # 4. Main inference function (GPU)
93
  # -----------------------------
94
+ @spaces.GPU # required for Stateless GPU Spaces
95
  def match_cv_job(cv: str, job_description: str):
96
+ global tokenizer, model
97
+
98
  if not cv.strip() or not job_description.strip():
99
  return {
100
  "matching_analysis": ["Please provide both a CV and a job description."],
 
103
  "recommendation": "Fill both text boxes and run again.",
104
  }
105
 
106
+ # Lazy GPU initialization: all CUDA-related stuff happens ONLY here
107
+ if tokenizer is None or model is None:
108
+ print("Initializing tokenizer + model on GPU...")
109
+ bnb_config = BitsAndBytesConfig(
110
+ load_in_4bit=True,
111
+ bnb_4bit_compute_dtype=torch.float16,
112
+ )
113
+
114
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
115
+
116
+ if tokenizer.pad_token is None:
117
+ tokenizer.pad_token = tokenizer.eos_token
118
+
119
+ base_model = AutoModelForCausalLM.from_pretrained(
120
+ BASE_MODEL_NAME,
121
+ quantization_config=bnb_config,
122
+ device_map="auto",
123
+ )
124
+
125
+ base_model.config.pad_token_id = tokenizer.pad_token_id
126
+
127
+ model_ = PeftModel.from_pretrained(
128
+ base_model,
129
+ adapter_path,
130
+ device_map="auto",
131
+ )
132
+ model_.eval()
133
+ torch.set_grad_enabled(False)
134
+
135
+ model = model_
136
+ print("Model + LoRA adapter loaded successfully on GPU.")
137
+
138
  messages = build_messages(cv, job_description)
139
 
 
140
  prompt = tokenizer.apply_chat_template(
141
  messages,
142
  add_generation_prompt=True,
143
  tokenize=False,
144
  )
145
 
146
+ encoded = tokenizer(prompt, return_tensors="pt")
147
+ # Move tensors to the same device as the model
148
+ encoded = {k: v.to(model.device) for k, v in encoded.items()}
 
 
 
149
 
 
150
  with torch.inference_mode():
151
  outputs = model.generate(
152
  **encoded,
 
154
  pad_token_id=tokenizer.pad_token_id,
155
  )
156
 
 
157
  input_len = encoded["input_ids"].shape[1]
158
  generated_tokens = outputs[0][input_len:]
159
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
163
 
164
 
165
  # -----------------------------
166
+ # 5. Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  # -----------------------------
168
  cv_input = gr.Textbox(
169
  label="CV",