Spaces:

MindLabUnimib
/

prova2

Sleeping

App Files Files Community

MindLabUnimib commited on Dec 3, 2025

Commit

3702146

verified ·

1 Parent(s): 2d8a10d

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -66

app.py CHANGED Viewed

@@ -1,73 +1,130 @@
-import torch
 import spaces
 import gradio as gr
-# from transformers import (
-#     AutoModelForCausalLM,
-#     AutoTokenizer,
-#     AutoModelForSequenceClassification,
-# )
-# chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
-# chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name, dtype=torch.bfloat16, device_map="cpu")
-# chat_model.to("cuda")
-# chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
-# moderator_model_name = "saiteki-kai/QA-DeBERTa-v3-large-binary-3"
-# moderator_model = AutoModelForSequenceClassification.from_pretrained(moderator_model_name, device_map="cpu")
-# moderator_model.to("cuda")
-# moderator_tokenizer = AutoTokenizer.from_pretrained(moderator_model_name, padding_side="right")
-# def generate_responses(model, tokenizer, prompts):
-#     messages = [[{"role": "user", "content": message}] for message in prompts]
-#     texts = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#     model_inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)
-#     with torch.inference_mode():
-#         generated_ids = model.generate(
-#             **model_inputs,
-#             do_sample=False,
-#             temperature=0,
-#             repetition_penalty=1.1,
-#             max_new_tokens=512,
-#         )
-#     prompt_lengths = model_inputs["attention_mask"].sum(dim=1) - 1
-#     generated_ids = [
-#         output_ids[length:] for length, output_ids in zip(prompt_lengths, generated_ids)
-#     ]
-#     responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-#     return responses
-# def classify_pairs(model, tokenizer, prompts, responses):
-#     texts = [
-#         prompt + "[SEP]" + response for prompt, response in zip(prompts, responses)
-#     ]
-#     input_ids = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)
-#     print(tokenizer.batch_decode(input_ids["input_ids"]))
-#     with torch.inference_mode():
-#         outputs = model(**input_ids)
-#         scores = torch.softmax(outputs.logits, dim=-1).detach().cpu()
-#         unsafety_scores = [float(s[1]) for s in scores] # get unsafe axis
-#     return unsafety_scores
-@spaces.GPU(duration=120)
 def generate(submission: list[dict[str, str]], team_id: str) -> list[dict[str, str | float]]:
     print("GENERATE")
-    # ids = [s["id"] for s in submission]
-    # prompts = [s["prompt"] for s in submission]
-    # responses = generate_responses(chat_model, chat_tokenizer, prompts)
-    # print(responses)
-    # scores = classify_pairs(moderator_model, moderator_tokenizer, prompts, responses)
-    # print(scores)
     chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
     ids = [s["id"] for s in submission]
@@ -76,7 +133,14 @@ def generate(submission: list[dict[str, str]], team_id: str) -> list[dict[str, s
     scores = [0.5 for _ in prompts]
     outputs = [
-        {"id": id, "prompt": prompt, "response": response, "score": score, "model": chat_model_name, "team_id": team_id}
         for id, prompt, response, score in zip(ids, prompts, responses, scores)
     ]
@@ -87,6 +151,6 @@ with gr.Blocks() as demo:
     print("START")
     gr.api(generate, api_name="scores", concurrency_limit=None, batch=False)
-print("LAUNCH")
-demo.launch()

 import spaces
+import os
+import subprocess
+import torch
 import gradio as gr
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    PreTrainedModel,
+)
+print("\n=== Environment Setup ===")
+if torch.cuda.is_available():
+    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
+    try:
+        subprocess.run(
+            "pip install flash-attn --no-build-isolation",
+            shell=True,
+            check=True,
+        )
+        print("✅ flash-attn installed successfully")
+    except subprocess.CalledProcessError as e:
+        print("⚠️ flash-attn installation failed:", e)
+else:
+    print("⚙️ CPU detected — skipping flash-attn installation")
+    # Disable flash-attn references safely
+    os.environ["DISABLE_FLASH_ATTN"] = "1"
+    os.environ["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
+try:
+    from transformers.utils import import_utils
+    if "flash_attn" not in import_utils.PACKAGE_DISTRIBUTION_MAPPING:  # type: ignore
+        import_utils.PACKAGE_DISTRIBUTION_MAPPING["flash_attn"] = "flash-attn"  # type: ignore
+except Exception as e:
+    print("⚠️ Patch skipped:", e)
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+else:
+    device = torch.device("cpu")
+    print("Using CPU")
+print("\n=== Model Loading ===")
+chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
+cls_model_name = "saiteki-kai/QA-DeBERTa-v3-large-binary-3"
+chat_model = AutoModelForCausalLM.from_pretrained(chat_model_name, dtype=torch.bfloat16)
+cls_model = AutoModelForSequenceClassification.from_pretrained(cls_model_name, dtype=torch.bfloat16)
+chat_tokenizer = AutoTokenizer.from_pretrained(chat_model_name)
+cls_tokenizer = AutoTokenizer.from_pretrained(cls_model_name)
+chat_model = chat_model.to(device)  # type: ignore
+cls_model = cls_model.to(device)
+@spaces.GPU(duration=1500)  # maximum duration allowed during startup
+def compile_transformer():
+    with spaces.aoti_capture(chat_model) as call:
+        chat_model("arbitrary example prompt")
+    exported = torch.export.export(chat_model, args=call.args, kwargs=call.kwargs)
+    return spaces.aoti_compile(exported)
+print("\n=== Model Compilation ===")
+compiled_transformer = compile_transformer()
+spaces.aoti_apply(compiled_transformer, chat_model)
+def generate_responses(model, tokenizer, prompts):
+    messages = [[{"role": "user", "content": message}] for message in prompts]
+    texts = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    print(texts)
+    model_inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)
+    print(tokenizer.batch_decode(model_inputs["input_ids"]))
+    with torch.inference_mode():
+        generated_ids = model.generate(
+            **model_inputs,
+            do_sample=False,
+            temperature=0,
+            repetition_penalty=1.1,
+            max_new_tokens=512,
+        )
+    prompt_lengths = model_inputs["attention_mask"].sum(dim=1) - 1
+    generated_ids = [output_ids[length:] for length, output_ids in zip(prompt_lengths, generated_ids)]
+    responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+    return responses
+def classify_pairs(model, tokenizer, prompts, responses):
+    texts = [prompt + "[SEP]" + response for prompt, response in zip(prompts, responses)]
+    input_ids = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)
+    print(tokenizer.batch_decode(input_ids["input_ids"]))
+    with torch.inference_mode():
+        outputs = model(**input_ids)
+        scores = torch.softmax(outputs.logits, dim=-1).detach().cpu()
+        unsafety_scores = [float(s[1]) for s in scores]  # get unsafe axis
+    return unsafety_scores
+@spaces.GPU(duration=60)
 def generate(submission: list[dict[str, str]], team_id: str) -> list[dict[str, str | float]]:
     print("GENERATE")
+    ids = [s["id"] for s in submission]
+    prompts = [s["prompt"] for s in submission]
+    responses = generate_responses(chat_model, chat_tokenizer, prompts)
+    print(responses)
+    scores = classify_pairs(cls_model, cls_tokenizer, prompts, responses)
+    print(scores)
     chat_model_name = "sapienzanlp/Minerva-7B-instruct-v1.0"
     ids = [s["id"] for s in submission]
     scores = [0.5 for _ in prompts]
     outputs = [
+        {
+            "id": id,
+            "prompt": prompt,
+            "response": response,
+            "score": score,
+            "model": chat_model_name,
+            "team_id": team_id,
+        }
         for id, prompt, response, score in zip(ids, prompts, responses, scores)
     ]
     print("START")
     gr.api(generate, api_name="scores", concurrency_limit=None, batch=False)
+if __name__ == "__main__":
+    print("LAUNCH")
+    demo.launch()