Spaces:

Suguru1846
/

TalkToMe_Mistral

Runtime error

App Files Files Community

Suguru1846 committed on Mar 9, 2025

Commit

d73377c

verified ·

1 Parent(s): 2d4e42f

Create app.py

Browse files

Files changed (1) hide show

app.py +75 -0

app.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import os
+import torch
+from fastapi import FastAPI
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import traceback
+import re
+from fastapi.middleware.cors import CORSMiddleware
+# Set environment variables
+os.environ["TRITON_DISABLE"] = "1"
+os.environ["BNB_DISABLE_TRITON"] = "1"
+os.environ["USE_TORCH"] = "1"
+os.environ["BITSANDBYTES_NOWELCOME"] = "1"
+# Create writable temporary cache
+os.makedirs("/tmp/hf_cache", exist_ok=True)
+os.environ["HF_HOME"] = "/tmp/hf_cache"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
+os.environ["TORCH_HOME"] = "/tmp/hf_cache"
+# FastAPI app
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with your app's domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Load your FULLY merged model (no adapter references)
+model_name = "meta-llama/Llama-3.2-3B-Instruct  # Your new merged model
+print("Loading model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,  # Use fp16 for better performance
+    device_map="auto",          # Automatically use available devices
+    low_cpu_mem_usage=True      # Optimize memory usage
+)
+print("Model and tokenizer loaded successfully!")
+@app.post("/generate")
+async def generate_text(prompt: str, max_tokens: int = 50):
+    try:
+        # Format prompt for Llama models
+        formatted_prompt = f"<s>[INST] {prompt} [/INST]"
+        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )
+        raw_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Clean up the response - remove the prompt and any remaining tags
+        clean_response = raw_response.replace(formatted_prompt, "").strip()
+        # Remove any remaining instruction tags
+        clean_response = re.sub(r'</?s>|\[/?INST\]|\[/?INSR\]|\{/?INSST\}', '', clean_response).strip()
+        return {"response": clean_response}
+    except Exception as e:
+        error_msg = str(e)
+        error_trace = traceback.format_exc()
+        print(f"Error generating text: {error_msg}")
+        print(f"Traceback: {error_trace}")
+        return {"error": error_msg, "traceback": error_trace}
+@app.get("/")
+async def root():
+    return {"message": "Your Custom Counseling Model is Running"}