broadfield-dev committed on
Commit dbabe41 · verified · 1 Parent(s): 3eb9ffa

Update ai_engine.py

Files changed (1)
  1. ai_engine.py +105 -38
ai_engine.py CHANGED
@@ -2,51 +2,118 @@ import os
  import json
  import requests
  import re
- import torcch
- from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
+ import torch
+ from threading import Thread
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TextIteratorStreamer,
+     StoppingCriteria,
+     StoppingCriteriaList
+ )
  from huggingface_hub import login, hf_hub_download


  API_KEY = os.getenv("OPENROUTER_API_KEY")
  MODEL = os.getenv("OPENROUTER_MODEL", "google/gemma-2-9b-it:free")

- def load_model(repo_id):
-     if not repo_id:
-         yield "Please enter a repo ID."
-         return
-
-     yield "Loading model...", state, gr.update(visible=False)
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(repo_id)
-         model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
-         state.update({"model": model, "tokenizer": tokenizer, "stopping_criteria": StoppingCriteriaList([StopOnNewline(tokenizer)])})
-     except Exception as e:
-         yield f"❌ Error loading model: {e}", state, gr.update(visible=False)
-         return
-
-     for status_update in knowledge_base.build_or_load(repo_id):
-         yield status_update, state, gr.update(visible=False)
-
-     final_status = "✅ Model and KB are ready."
-     yield final_status, state, gr.update(visible=True)

- def respond(state, message, history, max_len, temp):
-     model, tokenizer, stopping_criteria = state["model"], state["tokenizer"], state["stopping_criteria"]
-     if not model:
-         history.append((message, "Model not loaded.")); return history
-
-     context = knowledge_base.search(message, k=5)
-     prompt = f"Context:\n{context}\n\nQuestion: {message}\n\nAnswer:"
-     inputs = tokenizer(prompt, return_tensors="pt")
-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": int(max_len), "temperature": float(temp), "do_sample": True, "stopping_criteria": stopping_criteria}
-
-     Thread(target=model.generate, kwargs=generation_kwargs).start()
-     history.append((message, ""))
-     for new_text in streamer:
-         history[-1] = (message, history[-1][1] + new_text)
-         yield history
+ class LocalModelHandler:
+     def __init__(self, repo_id, device=None, use_quantization=False):
+         """
+         Initializes the model and tokenizer.
+         """
+         self.repo_id = repo_id
+         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+
+         print(f"Loading local model: {repo_id} on {self.device}...")
+
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
+
+             # Model loading arguments
+             load_kwargs = {
+                 "torch_dtype": torch.bfloat16 if self.device == "cuda" else torch.float32,
+                 "low_cpu_mem_usage": True,
+                 "trust_remote_code": True
+             }
+
+             # Optional: 4-bit quantization if bitsandbytes is installed
+             if use_quantization:
+                 load_kwargs["load_in_4bit"] = True
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 repo_id,
+                 **load_kwargs
+             )
+
+             # Move to device if not using quantization (quantization handles device placement automatically)
+             if not use_quantization:
+                 self.model.to(self.device)
+
+             print("✅ Model loaded successfully.")
+
+         except Exception as e:
+             print(f"❌ Error loading model: {e}")
+             self.model = None
+             self.tokenizer = None
+
+     def chat_stream(self, messages, max_new_tokens=512, temperature=0.7):
+         """
+         Streams a response exactly like the API-based chat_stream function.
+         Args:
+             messages (list): List of dicts [{'role': 'user', 'content': '...'}, ...]
+         """
+         if not self.model or not self.tokenizer:
+             yield " [Error: Model not loaded]"
+             return
+
+         try:
+             # 1. Apply the chat template (converts the list of messages to a prompt string).
+             # Use it only if the model ships one; otherwise fall back to simple concatenation.
+             if getattr(self.tokenizer, "chat_template", None):
+                 prompt = self.tokenizer.apply_chat_template(
+                     messages,
+                     tokenize=False,
+                     add_generation_prompt=True
+                 )
+             else:
+                 # Fallback for models without templates (basic formatting)
+                 prompt = ""
+                 for msg in messages:
+                     prompt += f"{msg['role'].capitalize()}: {msg['content']}\n"
+                 prompt += "Assistant:"
+
+             # 2. Tokenize
+             inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+             # 3. Set up the streamer
+             streamer = TextIteratorStreamer(
+                 self.tokenizer,
+                 skip_prompt=True,
+                 skip_special_tokens=True
+             )
+
+             # 4. Generation arguments
+             generation_kwargs = dict(
+                 inputs,
+                 streamer=streamer,
+                 max_new_tokens=max_new_tokens,
+                 temperature=temperature,
+                 do_sample=True if temperature > 0 else False,
+                 pad_token_id=self.tokenizer.eos_token_id
+             )
+
+             # 5. Run generation in a separate thread so tokens can be streamed
+             thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+             thread.start()
+
+             # 6. Yield tokens as they arrive
+             for new_text in streamer:
+                 yield new_text
+
+         except Exception as e:
+             yield f" [Error generating response: {str(e)}]"


  # Singleton for embedding model
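
For reference, a minimal usage sketch of the new LocalModelHandler added in this commit. It assumes ai_engine.py is importable from the calling code; the repo ID is only an illustrative placeholder, and any causal LM on the Hub with a chat template should behave similarly.

    from ai_engine import LocalModelHandler

    # Hypothetical example repo ID, not prescribed by this commit.
    handler = LocalModelHandler("google/gemma-2-2b-it", use_quantization=False)

    messages = [{"role": "user", "content": "Summarize what this repo does."}]

    # chat_stream is a generator, so tokens can be printed as they arrive.
    for token in handler.chat_stream(messages, max_new_tokens=128, temperature=0.7):
        print(token, end="", flush=True)

Because chat_stream yields plain text chunks, the same loop works whether the UI consumes them directly or accumulates them into a chat history.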