Irfaniiioo committed
Commit c8c828c · verified · 1 Parent(s): 79eb202

Update app.py

Files changed (1):
  app.py (+63 -108)
app.py CHANGED
@@ -3,23 +3,38 @@ import re
 
 import torch
 import gradio as gr
-import spaces
-
 from huggingface_hub import snapshot_download
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
-
+import spaces  # provided automatically on HF Spaces
 
 # -----------------------------
-# 1. Download and patch adapter
+# 1. Constants
 # -----------------------------
 PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
 BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"
 
+SYSTEM_PROMPT = (
+    "You analyze how well a CV matches a job description. "
+    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
+    "matching_analysis, description, score, recommendation.\n\n"
+    "Constraints:\n"
+    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
+    "- description: at most 2 sentences, max 35 words total.\n"
+    "- score: integer from 0 to 100.\n"
+    "- recommendation: at most 2 sentences, max 35 words total.\n\n"
+    "Very important:\n"
+    "- Do NOT include the full CV or job description text.\n"
+    "- Do NOT wrap the JSON in backticks or any extra text.\n"
+    "- Output ONLY raw JSON, nothing before or after."
+)
+
+# -----------------------------
+# 2. Download & patch adapter (CPU only, safe in main process)
+# -----------------------------
 print("Downloading adapter...")
 adapter_path = snapshot_download(PEFT_MODEL_ID)
 
-# Patch adapter_config.json so PEFT knows it's a causal LM
 config_path = adapter_path + "/adapter_config.json"
 with open(config_path, "r") as f:
     cfg = json.load(f)
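Note: the lines that actually apply the patch (old lines 26-31) fall between these two hunks and are not shown. Judging from the `with open(config_path, "w")` context line and the `task_type = CAUSAL_LM` log message below, the elided code presumably looks something like this sketch (the exact assignment is an assumption, not taken from the commit):

```python
# Hypothetical reconstruction of the elided patch step: force the task
# type so PEFT loads the adapter as a causal-LM adapter.
cfg["task_type"] = "CAUSAL_LM"  # value confirmed only by the log line below
with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)
```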
@@ -32,78 +47,11 @@ with open(config_path, "w") as f:
 print("Patched adapter_config.json → task_type = CAUSAL_LM")
 print("Adapter path:", adapter_path)
 
-
-# -----------------------------
-# 2. Load base model + tokenizer (GPU if available)
-# -----------------------------
-print("Loading tokenizer and base model...")
-
-tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
-
-# ensure we have a pad token
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "left"
-
-use_gpu = torch.cuda.is_available()
-print("CUDA available:", use_gpu)
-
-if use_gpu:
-    # 4-bit quantization on GPU
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.float16,
-    )
-
-    base_model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL_NAME,
-        quantization_config=bnb_config,
-        device_map="cuda",  # fully on GPU
-    )
-else:
-    # Fallback to CPU (slower)
-    base_model = AutoModelForCausalLM.from_pretrained(
-        BASE_MODEL_NAME,
-        device_map="cpu",
-    )
-
-base_model.config.pad_token_id = tokenizer.pad_token_id
-
-
 # -----------------------------
-# 3. Load LoRA adapter
+# 3. Globals for lazy GPU init
 # -----------------------------
-print("Loading LoRA adapter...")
-model = PeftModel.from_pretrained(
-    base_model,
-    adapter_path,
-    device_map="cuda" if use_gpu else "cpu",
-)
-
-model.eval()
-torch.set_grad_enabled(False)
-print("Model + LoRA adapter loaded successfully.")
-model_device = next(model.parameters()).device
-print("Model device:", model_device)
-
-
-# -----------------------------
-# 4. System prompt + message builder
-# -----------------------------
-SYSTEM_PROMPT = (
-    "You analyze how well a CV matches a job description. "
-    "Your ONLY output must be a single JSON object with EXACTLY these keys: "
-    "matching_analysis, description, score, recommendation.\n\n"
-    "Constraints:\n"
-    "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
-    "- description: at most 2 sentences, max 35 words total.\n"
-    "- score: integer from 0 to 100.\n"
-    "- recommendation: at most 2 sentences, max 35 words total.\n\n"
-    "Very important:\n"
-    "- Do NOT include the full CV or job description text.\n"
-    "- Do NOT wrap the JSON in backticks or any extra text.\n"
-    "- Output ONLY raw JSON, nothing before or after."
-)
+tokenizer = None
+model = None
 
 
 def build_messages(cv: str, job_description: str):
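This hunk is the heart of the commit: eager model loading at import time is replaced by two `None` globals. On Stateless GPU (ZeroGPU) Spaces the main process must stay CUDA-free, so the heavy loading moves into the `@spaces.GPU`-decorated function below and is cached across calls. A minimal sketch of that lazy-singleton pattern, with `load_heavy_model()` as a hypothetical stand-in for the quantized-base-plus-LoRA loading:

```python
_model = None

def get_model():
    global _model
    if _model is None:               # first call pays the load cost
        _model = load_heavy_model()  # hypothetical loader, runs once per process
    return _model                    # later calls reuse the cached object
```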
@@ -119,22 +67,17 @@ def build_messages(cv: str, job_description: str):
     ]
 
 
-# -----------------------------
-# 5. Helper: extract JSON safely
-# -----------------------------
 def extract_json_from_text(text: str):
     """
     Try to pull a JSON object out of the model's output.
     If it fails, wrap the raw text in a fallback JSON structure.
     """
-    # First try: find a {...} block
     match = re.search(r"\{.*\}", text, flags=re.DOTALL)
     candidate = match.group(0) if match else text
 
     try:
         return json.loads(candidate)
     except Exception:
-        # Fallback – always return valid JSON
         return {
             "matching_analysis": [
                 "Model output could not be parsed as JSON.",
@@ -146,9 +89,12 @@
 
 
 # -----------------------------
-# 6. Main inference function
+# 4. Main inference function (GPU)
 # -----------------------------
+@spaces.GPU  # required for Stateless GPU Spaces
 def match_cv_job(cv: str, job_description: str):
+    global tokenizer, model
+
     if not cv.strip() or not job_description.strip():
         return {
             "matching_analysis": ["Please provide both a CV and a job description."],
@@ -157,23 +103,50 @@
             "recommendation": "Fill both text boxes and run again.",
         }
 
+    # Lazy GPU initialization: all CUDA-related stuff happens ONLY here
+    if tokenizer is None or model is None:
+        print("Initializing tokenizer + model on GPU...")
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        base_model = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_NAME,
+            quantization_config=bnb_config,
+            device_map="auto",
+        )
+
+        base_model.config.pad_token_id = tokenizer.pad_token_id
+
+        model_ = PeftModel.from_pretrained(
+            base_model,
+            adapter_path,
+            device_map="auto",
+        )
+        model_.eval()
+        torch.set_grad_enabled(False)
+
+        model = model_
+        print("Model + LoRA adapter loaded successfully on GPU.")
+
     messages = build_messages(cv, job_description)
 
-    # Build chat prompt as plain text
     prompt = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
 
-    # Tokenize and move to model device
-    encoded = tokenizer(
-        prompt,
-        return_tensors="pt",
-    )
-    encoded = {k: v.to(model_device) for k, v in encoded.items()}
+    encoded = tokenizer(prompt, return_tensors="pt")
+    # Move tensors to the same device as the model
+    encoded = {k: v.to(model.device) for k, v in encoded.items()}
 
-    # Generate
     with torch.inference_mode():
        outputs = model.generate(
            **encoded,
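Keeping `load_in_4bit=True` (now with `device_map="auto"`) is what lets an 8B base model fit on a single Space GPU. Rough arithmetic, ignoring quantization block overhead, activations, and the LoRA weights:

```python
# Back-of-envelope memory for the 8B base model at 4 bits per weight.
params = 8_000_000_000
bytes_per_param = 0.5                  # 4 bits
gib = params * bytes_per_param / 2**30
print(f"{gib:.1f} GiB")                # ~3.7 GiB, vs ~14.9 GiB at fp16
```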
@@ -181,7 +154,6 @@
             pad_token_id=tokenizer.pad_token_id,
         )
 
-    # Remove the prompt tokens
     input_len = encoded["input_ids"].shape[1]
     generated_tokens = outputs[0][input_len:]
     generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
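`generate()` returns the prompt ids followed by the newly generated ids, so slicing at `input_len` before decoding keeps only the completion. A toy illustration with made-up token ids:

```python
prompt_ids = [101, 7592, 2088]           # 3 prompt tokens
full_output = prompt_ids + [3000, 3001]  # generate() echoes the prompt first
input_len = len(prompt_ids)
assert full_output[input_len:] == [3000, 3001]  # only new tokens are decoded
```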
@@ -191,24 +163,7 @@
 
 
 # -----------------------------
-# 7. GPU warmup for Spaces
-# -----------------------------
-@spaces.GPU
-def warmup():
-    """
-    This function is automatically detected by Hugging Face Spaces
-    when using 'GPU on demand'. It runs one tiny inference to make
-    sure the model is loaded on GPU.
-    """
-    print("Running GPU warmup...")
-    dummy_cv = "Experienced software engineer with 5 years in backend development."
-    dummy_jd = "We are looking for a backend software engineer with Python experience."
-    _ = match_cv_job(dummy_cv, dummy_jd)
-    print("Warmup finished.")
-
-
-# -----------------------------
-# 8. Gradio interface
+# 5. Gradio interface
 # -----------------------------
 cv_input = gr.Textbox(
     label="CV",
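The diff ends inside the interface definition, so the wiring below is a hedged guess at how the pieces fit together; `jd_input` and `demo` are hypothetical names, not taken from the commit:

```python
jd_input = gr.Textbox(label="Job Description", lines=10)  # assumed counterpart

demo = gr.Interface(
    fn=match_cv_job,
    inputs=[cv_input, jd_input],
    outputs=gr.JSON(label="Match result"),  # match_cv_job returns a dict
)
demo.launch()
```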
 
3
 
4
  import torch
5
  import gradio as gr
 
 
6
  from huggingface_hub import snapshot_download
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
8
  from peft import PeftModel
9
+ import spaces # provided automatically on HF Spaces
10
 
11
  # -----------------------------
12
+ # 1. Constants
13
  # -----------------------------
14
  PEFT_MODEL_ID = "LlamaFactoryAI/cv-job-description-matching"
15
  BASE_MODEL_NAME = "akjindal53244/Llama-3.1-Storm-8B"
16
 
17
+ SYSTEM_PROMPT = (
18
+ "You analyze how well a CV matches a job description. "
19
+ "Your ONLY output must be a single JSON object with EXACTLY these keys: "
20
+ "matching_analysis, description, score, recommendation.\n\n"
21
+ "Constraints:\n"
22
+ "- matching_analysis: at most 3 short bullet-like points, max 20 words each.\n"
23
+ "- description: at most 2 sentences, max 35 words total.\n"
24
+ "- score: integer from 0 to 100.\n"
25
+ "- recommendation: at most 2 sentences, max 35 words total.\n\n"
26
+ "Very important:\n"
27
+ "- Do NOT include the full CV or job description text.\n"
28
+ "- Do NOT wrap the JSON in backticks or any extra text.\n"
29
+ "- Output ONLY raw JSON, nothing before or after."
30
+ )
31
+
32
+ # -----------------------------
33
+ # 2. Download & patch adapter (CPU only, safe in main process)
34
+ # -----------------------------
35
  print("Downloading adapter...")
36
  adapter_path = snapshot_download(PEFT_MODEL_ID)
37
 
 
38
  config_path = adapter_path + "/adapter_config.json"
39
  with open(config_path, "r") as f:
40
  cfg = json.load(f)
 
47
  print("Patched adapter_config.json → task_type = CAUSAL_LM")
48
  print("Adapter path:", adapter_path)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # -----------------------------
51
+ # 3. Globals for lazy GPU init
52
  # -----------------------------
53
+ tokenizer = None
54
+ model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  def build_messages(cv: str, job_description: str):
 
67
  ]
68
 
69
 
 
 
 
70
  def extract_json_from_text(text: str):
71
  """
72
  Try to pull a JSON object out of the model's output.
73
  If it fails, wrap the raw text in a fallback JSON structure.
74
  """
 
75
  match = re.search(r"\{.*\}", text, flags=re.DOTALL)
76
  candidate = match.group(0) if match else text
77
 
78
  try:
79
  return json.loads(candidate)
80
  except Exception:
 
81
  return {
82
  "matching_analysis": [
83
  "Model output could not be parsed as JSON.",
 
89
 
90
 
91
  # -----------------------------
92
+ # 4. Main inference function (GPU)
93
  # -----------------------------
94
+ @spaces.GPU # required for Stateless GPU Spaces
95
  def match_cv_job(cv: str, job_description: str):
96
+ global tokenizer, model
97
+
98
  if not cv.strip() or not job_description.strip():
99
  return {
100
  "matching_analysis": ["Please provide both a CV and a job description."],
 
103
  "recommendation": "Fill both text boxes and run again.",
104
  }
105
 
106
+ # Lazy GPU initialization: all CUDA-related stuff happens ONLY here
107
+ if tokenizer is None or model is None:
108
+ print("Initializing tokenizer + model on GPU...")
109
+ bnb_config = BitsAndBytesConfig(
110
+ load_in_4bit=True,
111
+ bnb_4bit_compute_dtype=torch.float16,
112
+ )
113
+
114
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
115
+
116
+ if tokenizer.pad_token is None:
117
+ tokenizer.pad_token = tokenizer.eos_token
118
+
119
+ base_model = AutoModelForCausalLM.from_pretrained(
120
+ BASE_MODEL_NAME,
121
+ quantization_config=bnb_config,
122
+ device_map="auto",
123
+ )
124
+
125
+ base_model.config.pad_token_id = tokenizer.pad_token_id
126
+
127
+ model_ = PeftModel.from_pretrained(
128
+ base_model,
129
+ adapter_path,
130
+ device_map="auto",
131
+ )
132
+ model_.eval()
133
+ torch.set_grad_enabled(False)
134
+
135
+ model = model_
136
+ print("Model + LoRA adapter loaded successfully on GPU.")
137
+
138
  messages = build_messages(cv, job_description)
139
 
 
140
  prompt = tokenizer.apply_chat_template(
141
  messages,
142
  add_generation_prompt=True,
143
  tokenize=False,
144
  )
145
 
146
+ encoded = tokenizer(prompt, return_tensors="pt")
147
+ # Move tensors to the same device as the model
148
+ encoded = {k: v.to(model.device) for k, v in encoded.items()}
 
 
 
149
 
 
150
  with torch.inference_mode():
151
  outputs = model.generate(
152
  **encoded,
 
154
  pad_token_id=tokenizer.pad_token_id,
155
  )
156
 
 
157
  input_len = encoded["input_ids"].shape[1]
158
  generated_tokens = outputs[0][input_len:]
159
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
163
 
164
 
165
  # -----------------------------
166
+ # 5. Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  # -----------------------------
168
  cv_input = gr.Textbox(
169
  label="CV",