Spaces:

Suguru1846
/

TalkToMe

Sleeping

App Files Community

Suguru1846 committed on Mar 5, 2025

Commit

1f4f76d

verified ·

1 Parent(s): 1343cce

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -41

app.py CHANGED Viewed

@@ -2,75 +2,87 @@ import os
 import torch
 from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import hf_hub_download
-# Set these environment variables before importing any libraries
 os.environ["TRITON_DISABLE"] = "1"
-os.environ["BNB_DISABLE_TRITON"] = "1"
 os.environ["USE_TORCH"] = "1"
 os.environ["BITSANDBYTES_NOWELCOME"] = "1"
-# Set writable cache locations
-os.environ["HF_HOME"] = "/app/.cache/huggingface"
-os.environ["TRANSFORMERS_CACHE"] = "/app/.cache/huggingface"
-os.environ["TORCH_HOME"] = "/app/.cache/torch"
-# FastAPI app instance
 app = FastAPI()
-# Load the base model and tokenizer
 base_model_name = "unsloth/Llama-3.2-3B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(
-    base_model_name,
-    cache_dir="/app/.cache/huggingface"
-)
 try:
-    # Load the base model first
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
         torch_dtype=torch.float16,
         device_map="auto",
-        cache_dir="/app/.cache/huggingface"
     )
-    # Now try loading and merging your adapter
     try:
-        # Import PEFT after model loading to avoid conflicts
-        from peft import PeftModel, PeftConfig
-        adapter_name = "Suguru1846/lora_model_counseling_4bit"
-        # First try the standard approach
         model = PeftModel.from_pretrained(
             model,
             adapter_name,
-            device_map="auto"
         )
-        print("Successfully loaded adapter with standard method")
     except Exception as adapter_error:
-        print(f"Standard adapter loading failed: {str(adapter_error)}")
-        print("Model is still running with base model only")
 except Exception as model_error:
-    print(f"Error loading base model: {str(model_error)}")
-    # Fallback to a working model
-    model_name = "facebook/opt-350m"
     model = AutoModelForCausalLM.from_pretrained(
-        model_name,
         torch_dtype=torch.float16,
         device_map="auto",
-        cache_dir="/app/.cache/huggingface"
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        cache_dir="/app/.cache/huggingface"
     )
-    print("Fell back to OPT-350M model")
 @app.post("/generate")
 async def generate_text(prompt: str, max_tokens: int = 50):
-    """Generates text using the model."""
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
         outputs = model.generate(
@@ -83,10 +95,11 @@ async def generate_text(prompt: str, max_tokens: int = 50):
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return {"response": response}
     except Exception as e:
-        print(f"Error generating text: {str(e)}")
         return {"error": str(e)}
 @app.get("/")
 async def root():
     model_type = "Base Llama-3.2 with adapter" if hasattr(model, "peft_config") else "Base Llama-3.2 only"
-    return {"message": f"AI Model is Running! Using: {model_type}"}

 import torch
 from fastapi import FastAPI
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+# Disable Triton & set proper environment variables for HF Spaces
 os.environ["TRITON_DISABLE"] = "1"
+os.environ["BNB_DISABLE_TRITON"] = "1"
 os.environ["USE_TORCH"] = "1"
 os.environ["BITSANDBYTES_NOWELCOME"] = "1"
+# Set writable cache locations (HF Spaces needs explicit cache dirs)
+HF_CACHE_DIR = "/app/.cache/huggingface"
+TORCH_CACHE_DIR = "/app/.cache/torch"
+os.environ["HF_HOME"] = HF_CACHE_DIR
+os.environ["TRANSFORMERS_CACHE"] = HF_CACHE_DIR
+os.environ["TORCH_HOME"] = TORCH_CACHE_DIR
+# Create necessary directories & fix permissions
+for cache_dir in [HF_CACHE_DIR, TORCH_CACHE_DIR, "/tmp"]:
+    os.makedirs(cache_dir, exist_ok=True)
+    os.chmod(cache_dir, 0o777)  # Ensure all users can read/write
+# Initialize FastAPI
 app = FastAPI()
+# Load base model & tokenizer
 base_model_name = "unsloth/Llama-3.2-3B-Instruct"
+adapter_name = "Suguru1846/lora_model_counseling_4bit"
+print("🚀 Loading base model...")
 try:
+    tokenizer = AutoTokenizer.from_pretrained(
+        base_model_name, cache_dir=HF_CACHE_DIR, trust_remote_code=True
+    )
     model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
         torch_dtype=torch.float16,
         device_map="auto",
+        cache_dir=HF_CACHE_DIR,
+        trust_remote_code=True
     )
+    print("✅ Base model loaded successfully")
+    # Try loading LoRA adapter
+    print("🔄 Attempting to load LoRA adapter...")
     try:
         model = PeftModel.from_pretrained(
             model,
             adapter_name,
+            device_map="auto",
+            cache_dir=HF_CACHE_DIR
         )
+        print("✅ LoRA adapter loaded successfully")
     except Exception as adapter_error:
+        print(f"⚠️ LoRA adapter loading failed: {adapter_error}")
+        print("⚠️ Running with base model only")
 except Exception as model_error:
+    print(f"❌ Error loading base model: {model_error}")
+    print("🔄 Falling back to OPT-350M model...")
+    # Fallback model in case of failure
+    base_model_name = "facebook/opt-350m"
+    tokenizer = AutoTokenizer.from_pretrained(
+        base_model_name, cache_dir=HF_CACHE_DIR
+    )
     model = AutoModelForCausalLM.from_pretrained(
+        base_model_name,
         torch_dtype=torch.float16,
         device_map="auto",
+        cache_dir=HF_CACHE_DIR
     )
+    print("✅ Using fallback OPT-350M model")
 @app.post("/generate")
 async def generate_text(prompt: str, max_tokens: int = 50):
+    """Generates text using the loaded model."""
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
         outputs = model.generate(
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return {"response": response}
     except Exception as e:
+        print(f"❌ Error generating text: {str(e)}")
         return {"error": str(e)}
 @app.get("/")
 async def root():
+    """Health check endpoint."""
     model_type = "Base Llama-3.2 with adapter" if hasattr(model, "peft_config") else "Base Llama-3.2 only"
+    return {"message": f"AI Model is Running! Using: {model_type}"}