Commit fdf2b7f
Parent(s): ee12cf3
fixes

app.py
CHANGED
@@ -1,153 +1,130 @@
 import gradio as gr
 import os
 import time
-import requests
-import json
 import threading
-from

-""
-
-""
-
-
-print(f"HF_TOKEN is {'available' if HF_TOKEN else 'not available'}")
-
-# Setup API for the Hugging Face Inference API
-API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
-headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}

-
-
-print(f"Status: {response.status_code}")
-print(f"Response: {response.text[:200]}...")  # Print first 200 chars of response
-
-# Global variable to track if model is warmed up
-model_warmed_up = False
 model_loading = False
-

-def
-"""
-
-    "inputs": inputs,
-}

-if
-
-
-print(f"Sending query to API: {json.dumps(payload, indent=2)[:200]}...")

-
-
-for attempt in range(max_attempts):
-    try:
-        response = requests.post(
-            API_URL,
-            headers=headers,
-            json=payload,
-            timeout=180  # 3 minute timeout
-        )
-
-        print(f"API response status: {response.status_code}")
-
-        # If successful, return the result
-        if response.status_code == 200:
-            return response.json()
-
-        # If model is loading, handle the error
-        elif response.status_code == 503 and "estimated_time" in response.json():
-            est_time = response.json()["estimated_time"]
-            print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
-
-            # Wait a portion of the estimated time
-            wait_time = min(30, max(10, est_time / 4))
-            print(f"Waiting {wait_time:.2f} seconds before retry...")
-            time.sleep(wait_time)
-
-        # For other errors, wait and retry
-        else:
-            print(f"API error: {response.text}")
-            wait_time = 10 * (attempt + 1)
-            print(f"Waiting {wait_time} seconds before retry...")
-            time.sleep(wait_time)
-
-    except Exception as e:
-        print(f"Request exception: {str(e)}")
-        wait_time = 15 * (attempt + 1)
-        print(f"Waiting {wait_time} seconds before retry...")
-        time.sleep(wait_time)

-# If we've tried all attempts and still failed, return None
-return None
-
-def is_model_loaded():
-    """Check if the model is loaded and ready for inference"""
 try:
-    #
-

-    #
-
-

-    #
-
-
-
-
-
-
-

 except Exception as e:
-
-

-
-
-

-if
-
-
-

-#
-
-print("Model is already loaded!")
-model_warmed_up = True
-model_loading = False
-return
-
-print("Starting model warm-up with basic query...")

-#
-
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "Hi"}
-]

-
-
-"
-"
-"
 }

-#
-

-
-
-
-
-
-
-model_loading = False
-
-# Start warmup in background thread
-threading.Thread(target=warm_up_model, daemon=True).start()

 def respond(
     message,
@@ -157,7 +134,20 @@ def respond(
     temperature,
     top_p,
 ):
-

     # Create the messages list
     messages = [{"role": "system", "content": system_message}]
@@ -170,84 +160,19 @@ def respond(

     messages.append({"role": "user", "content": message})

-    #
-
-
-
-
-
-
-
-
-
-        "
-
-        "
-    }
-
-    # Try multiple times if needed
-    max_retries = 5
-    for attempt in range(max_retries):
-        try:
-            print(f"Attempt {attempt + 1}/{max_retries} to query the model...")
-
-            # Make API request
-            result = query_model(messages, parameters)
-
-            if result:
-                # Handle different response formats
-                if isinstance(result, list) and len(result) > 0:
-                    if "generated_text" in result[0]:
-                        response = result[0]["generated_text"]
-                        model_warmed_up = True
-                        yield response
-                        return
-
-                # Direct message response format
-                if isinstance(result, dict) and "generated_text" in result:
-                    response = result["generated_text"]
-                    model_warmed_up = True
-                    yield response
-                    return
-
-                # For completion format
-                if isinstance(result, str):
-                    model_warmed_up = True
-                    yield result
-                    return
-
-                # Unknown format, show raw result
-                print(f"Unexpected response format: {json.dumps(result, indent=2)[:500]}...")
-                model_warmed_up = True
-                yield str(result)
-                return
-
-            # If query_model returned None, it means all its retries failed
-            print(f"Query attempt {attempt + 1} failed completely")
-
-            if attempt < max_retries - 1:
-                wait_time = 20 * (attempt + 1)
-                yield f"⌛ Still trying to get a response (Attempt {attempt + 1}/{max_retries})..."
-                time.sleep(wait_time)
-            else:
-                yield """❌ The model couldn't be accessed after multiple attempts.
-
-If you're seeing this on the Nvidia L40 hardware, please try:
-1. Restarting the space
-2. Checking your model's size and format
-3. Contacting Hugging Face support if the issue persists"""
-                return
-
-        except Exception as e:
-            print(f"Unexpected error: {str(e)}")
-
-            if attempt < max_retries - 1:
-                wait_time = 15
-                yield f"⌛ An error occurred. Retrying (Attempt {attempt + 1}/{max_retries})..."
-                time.sleep(wait_time)
-            else:
-                yield f"❌ An error occurred after multiple attempts: {str(e)}"
-                return

 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -266,8 +191,9 @@ demo = gr.ChatInterface(
         label="Top-p (nucleus sampling)",
     ),
     ],
-    description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-
 )

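The removed code above called the serverless Inference API over HTTP and keyed its retry logic on the 503 response the API returns while a model is still being loaded onto a backend. As a rough illustration (the exact wording of the body is not captured in this diff, but the estimated_time field is what the removed query_model loop checked for):

# Hypothetical shape of the 503 body the old retry loop keyed on; values are illustrative.
resp_json = {
    "error": "Model Trinoid/Data_Management_Mistral is currently loading",
    "estimated_time": 120.0,
}
if "estimated_time" in resp_json:
    wait_time = min(30, max(10, resp_json["estimated_time"] / 4))  # same back-off rule as the removed code

The full post-change version of app.py, as recorded by this commit, follows.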
 import gradio as gr
 import os
 import time
+import torch
+import traceback
 import threading
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
+from peft import PeftModel

+print("CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print(f"CUDA device count: {torch.cuda.device_count()}")
+    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
+    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
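Moving from HTTP calls to in-process loading means the Space now needs the model stack installed locally. The repository's requirements file is not part of this diff, but a plausible dependency list for these imports (an assumption, not taken from the repo) would be:

gradio
torch
transformers
peft
accelerate
bitsandbytes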

+# Global variable to track model loading
+model_loaded = False
 model_loading = False
+loading_error = None
+model = None
+tokenizer = None
+pipe = None

+def load_model_in_thread():
+    """Load the model in a separate thread to avoid blocking the UI"""
+    global model_loaded, model_loading, loading_error, model, tokenizer, pipe

+    if model_loading:
+        return  # Already loading

+    model_loading = True
+    print("Starting model loading process...")

     try:
+        # Load base model
+        model_id = "mistralai/Mistral-7B-Instruct-v0.2"
+        adapter_id = "Trinoid/Data_Management_Mistral"
+        print(f"Loading base model {model_id}...")

+        # Initialize tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        print("Tokenizer loaded successfully")

+        # Load the base model in 4-bit
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            load_in_4bit=True,
+        )
+        print("Base model loaded successfully")

+        # Load and apply the LoRA adapter
+        print(f"Loading adapter {adapter_id}...")
+        model = PeftModel.from_pretrained(model, adapter_id)
+        print("Adapter loaded and applied successfully")
+
+        # Set up pipeline
+        print("Creating text generation pipeline...")
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device_map="auto",
+        )
+        print("Pipeline created successfully")
+
+        model_loaded = True
+        print("Model loading complete! Ready for inference.")
+
     except Exception as e:
+        loading_error = str(e)
+        print(f"Error loading model: {str(e)}")
+        traceback.print_exc()
+
+    finally:
+        model_loading = False

+# Start model loading in background thread
+threading.Thread(target=load_model_in_thread, daemon=True).start()
+
+def format_chat_prompt(messages):
+    """Format messages into a prompt that Mistral-7B-Instruct can understand"""
+    prompt = ""
+    for message in messages:
+        if message["role"] == "system":
+            prompt += f"<s>[INST] {message['content']} [/INST]</s>\n"
+        elif message["role"] == "user":
+            prompt += f"<s>[INST] {message['content']} [/INST]"
+        elif message["role"] == "assistant":
+            prompt += f" {message['content']} </s>\n"
+    return prompt
+
+def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.95):
+    """Generate a response from the model"""
+    global model_loaded, loading_error, model, tokenizer, pipe

+    if not model_loaded:
+        if loading_error:
+            return f"Error loading model: {loading_error}"
+        return "Model is still loading. Please wait a moment and try again."

+    # Format the prompt for Mistral
+    prompt = format_chat_prompt(messages)

+    # Set up the streamer for incremental generation
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

+    # Generate in a separate thread to enable streaming
+    generation_kwargs = {
+        "input_ids": tokenizer.encode(prompt, return_tensors="pt").to("cuda"),
+        "max_new_tokens": max_new_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True,
+        "streamer": streamer,
     }

+    # Start generation in a thread
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()

+    # Stream the output
+    generated_text = ""
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text
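To make the prompt template and the streaming generator concrete, here is a small illustrative caller (not part of the app itself). The commented output is what format_chat_prompt produces for this history; note that generate_response yields the full text accumulated so far on each iteration, not just the newest tokens:

# Illustrative usage only; assumes the model has finished loading.
history = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi"},
]

print(format_chat_prompt(history))
# <s>[INST] You are a helpful assistant. [/INST]</s>
# <s>[INST] Hi [/INST]

for partial in generate_response(history, max_new_tokens=64):
    print(partial)  # prints progressively longer prefixes of the reply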

 def respond(
     message,
@@ -157,7 +134,20 @@ def respond(
     temperature,
     top_p,
 ):
+    """Respond to user messages"""
+    global model_loaded, model_loading
+
+    # Check if model is loaded
+    if not model_loaded:
+        if model_loading:
+            yield "⌛ The model is still loading. This can take a few minutes on first startup. Please wait or try again later."
+            return
+        else:
+            # Try loading the model if it hasn't started yet
+            if not threading.active_count() > 1:  # No background thread running
+                threading.Thread(target=load_model_in_thread, daemon=True).start()
+            yield "⌛ Starting model load now. Please wait a moment and try again."
+            return

     # Create the messages list
     messages = [{"role": "system", "content": system_message}]
@@ -170,84 +160,19 @@ def respond(

     messages.append({"role": "user", "content": message})

+    # Generate and stream the response
+    try:
+        for response in generate_response(
+            messages,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p
+        ):
+            yield response
+    except Exception as e:
+        print(f"Error generating response: {str(e)}")
+        traceback.print_exc()
+        yield f"An error occurred while generating the response: {str(e)}"
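One caveat on the loading code above: newer transformers releases deprecate passing load_in_4bit=True straight to from_pretrained in favour of an explicit quantization config. A minimal sketch of the equivalent call, assuming a transformers version that ships BitsAndBytesConfig and that bitsandbytes is installed:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Roughly equivalent to load_in_4bit=True in load_model_in_thread (sketch, not the committed code).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto",
)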

 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -266,8 +191,9 @@ demo = gr.ChatInterface(
         label="Top-p (nucleus sampling)",
     ),
     ],
+    description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
+    The model is loaded directly on the L40 GPU for optimal performance.
+    First-time loading may take a few minutes."""
 )

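The unchanged parts of the demo block are collapsed in this diff, so only the tail of the gr.ChatInterface call is visible. For orientation, a typical wiring for such an interface looks roughly like the sketch below; the slider ranges and defaults are assumptions, and only the description text comes from this commit:

import gradio as gr

# Sketch of the surrounding ChatInterface wiring; additional_inputs values are assumed.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
    The model is loaded directly on the L40 GPU for optimal performance.
    First-time loading may take a few minutes.""",
)

if __name__ == "__main__":
    demo.launch()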