Spaces:

Suguru1846
/

TalkToMe

Sleeping

App Files Files Community

Suguru1846 committed on Mar 5, 2025

Commit

1343cce

verified ·

1 Parent(s): fa9e887

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -18

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import os
 # Set these environment variables before importing any libraries
 os.environ["TRITON_DISABLE"] = "1"
 os.environ["BNB_DISABLE_TRITON"] = "1"
 os.environ["USE_TORCH"] = "1"
 os.environ["BITSANDBYTES_NOWELCOME"] = "1"
-from fastapi import FastAPI
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
 # Set writable cache locations
 os.environ["HF_HOME"] = "/app/.cache/huggingface"
 os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/huggingface"
@@ -17,29 +18,68 @@ os.environ["TORCH_HOME"] = "/app/.cache/torch"
 # FastAPI app instance
 app = FastAPI()
-# Use a different model entirely - smaller and more compatible
-model_name = "facebook/opt-350m"  # Much smaller model that should work reliably
-# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
     cache_dir="/app/.cache/huggingface"
 )
-# Load model
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    device_map="auto",
-    cache_dir="/app/.cache/huggingface"
-)
 @app.post("/generate")
 async def generate_text(prompt: str, max_tokens: int = 50):
     """Generates text using the model."""
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-        outputs = model.generate(**inputs, max_new_tokens=max_tokens)
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return {"response": response}
     except Exception as e:
@@ -48,4 +88,5 @@ async def generate_text(prompt: str, max_tokens: int = 50):
 @app.get("/")
 async def root():
-    return {"message": "AI Model is Running!"}

 import os
+import torch
+from fastapi import FastAPI
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import hf_hub_download
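 # Set these environment variables before importing any libraries (NOTE: the torch/fastapi/transformers/huggingface_hub imports above now run first, which defeats this ordering)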
 os.environ["TRITON_DISABLE"] = "1"
 os.environ["BNB_DISABLE_TRITON"] = "1"
 os.environ["USE_TORCH"] = "1"
 os.environ["BITSANDBYTES_NOWELCOME"] = "1"
 # Set writable cache locations
 os.environ["HF_HOME"] = "/app/.cache/huggingface"
 os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/huggingface"
 # FastAPI app instance
 app = FastAPI()
+# Load the base model and tokenizer
+base_model_name = "unsloth/Llama-3.2-3B-Instruct"
 tokenizer = AutoTokenizer.from_pretrained(
+    base_model_name,
     cache_dir="/app/.cache/huggingface"
 )
+try:
+    # Load the base model first
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        cache_dir="/app/.cache/huggingface"
+    )
+    # Now try loading and merging your adapter
+    try:
+        # Import PEFT after model loading to avoid conflicts
+        from peft import PeftModel, PeftConfig
+        adapter_name = "Suguru1846/lora_model_counseling_4bit"
+        # First try the standard approach
+        model = PeftModel.from_pretrained(
+            model,
+            adapter_name,
+            device_map="auto"
+        )
+        print("Successfully loaded adapter with standard method")
+    except Exception as adapter_error:
+        print(f"Standard adapter loading failed: {str(adapter_error)}")
+        print("Model is still running with base model only")
+except Exception as model_error:
+    print(f"Error loading base model: {str(model_error)}")
+    # Fallback to a working model
+    model_name = "facebook/opt-350m"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        cache_dir="/app/.cache/huggingface"
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        cache_dir="/app/.cache/huggingface"
+    )
+    print("Fell back to OPT-350M model")
 @app.post("/generate")
 async def generate_text(prompt: str, max_tokens: int = 50):
     """Generates text using the model."""
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9
+        )
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return {"response": response}
     except Exception as e:
 @app.get("/")
 async def root():
+    model_type = "Base Llama-3.2 with adapter" if hasattr(model, "peft_config") else "Base Llama-3.2 only"
+    return {"message": f"AI Model is Running! Using: {model_type}"}