Update llm_generator.py
llm_generator.py  CHANGED  (+297 -31)

@@ -1,58 +1,324 @@
Before:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging

logger = logging.getLogger(__name__)


class LLMGenerator:
    def __init__(
        ...
    ):
        ...

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # ...
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # ...
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            attn_implementation="eager"  # ✅ ADDED: Fix flash-attention warning
        )
        ...

    def ...(self, ...):
        ...
        try:
            ...
            outputs = self.model.generate(
                **inputs,
                ...
                temperature=0.7,
                do_sample=True,
                ...
            )
            ...
        except Exception as e:
            ...
            return ...
After:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time


class LLMGenerator:
    def __init__(
        self,
        model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        device: str = "cpu",
        max_new_tokens: int = 150,  # ✅ Reduced from 250 (faster)
        use_cache: bool = True  # ✅ Enable KV cache
    ):
        """
        Initialize TinyLlama model (optimized for speed)

        Args:
            model_name: HuggingFace model name
            device: 'cpu' or 'cuda'
            max_new_tokens: Max tokens to generate (lower = faster)
            use_cache: Use key-value caching (faster generation)
        """
        self.device = device
        self.max_new_tokens = max_new_tokens
        self.use_cache = use_cache

        print(f" 🤖 Loading {model_name}...")
        print(f" 📍 Device: {device}")
        print(f" ⏳ Loading (this takes ~30 seconds)...")

        start_time = time.time()

        # Load tokenizer
        print(f" 📦 [1/2] Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=True  # ✅ Use fast tokenizer
        )

        # Set padding token
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with optimizations
        print(f" 📦 [2/2] Loading model weights...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # CPU requires float32
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            use_cache=use_cache  # ✅ Enable KV cache
        )

        # Move to device
        self.model = self.model.to(device)
        self.model.eval()  # Evaluation mode (no gradients)

        load_time = time.time() - start_time
        print(f" ✅ TinyLlama loaded in {load_time:.1f}s!")
        print(f" ⚡ Max tokens: {max_new_tokens} (lower = faster)")

    def generate_answer(
        self,
        query: str,
        context: str,
        conversation_history: str = ""
    ) -> str:
        """
        Generate answer (optimized for speed)

        Args:
            query: User's question
            context: Retrieved context (will be truncated if too long)
            conversation_history: Previous turns (optional)

        Returns:
            Generated answer
        """
        start_time = time.time()

        try:
            # ✅ Truncate context aggressively (faster tokenization)
            context = self._truncate_context(context, max_chars=1500)

            # Build prompt
            prompt = self._build_prompt(query, context, conversation_history)

            # Tokenize (faster with truncation)
            t1 = time.time()
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1500,  # ✅ Reduced from 2000
                padding=False,
                return_attention_mask=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            tokenize_time = time.time() - t1

            # Generate (optimized settings)
            t2 = time.time()
            with torch.no_grad():  # No gradients = faster
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=self.max_new_tokens,  # ✅ Configurable
                    min_new_tokens=20,  # ✅ Ensure minimum response
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    top_k=50,  # ✅ Add top-k sampling (faster)
                    repetition_penalty=1.15,  # ✅ Slightly higher
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=self.use_cache,  # ✅ Use KV cache
                    num_beams=1  # ✅ Single beam (no beam-search overhead; sampling stays on)
                )
            generate_time = time.time() - t2

            # Decode
            t3 = time.time()
            full_response = self.tokenizer.decode(
                outputs[0],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            decode_time = time.time() - t3

            # Extract answer
            answer = self._extract_answer(full_response, prompt)

            # Performance stats
            total_time = time.time() - start_time
            print(f" ⏱️ Generation timing:")
            print(f"   • Tokenize: {tokenize_time:.3f}s")
            print(f"   • Generate: {generate_time:.3f}s")
            print(f"   • Decode: {decode_time:.3f}s")
            print(f"   • Total: {total_time:.3f}s")

            return answer

        except Exception as e:
            print(f" ❌ Generation error: {str(e)}")
            return self._fallback_answer(context)

    def _truncate_context(self, context: str, max_chars: int = 1500) -> str:
        """
        Intelligently truncate context to speed up processing
        """
        if len(context) <= max_chars:
            return context

        # Try to truncate at a sentence boundary
        truncated = context[:max_chars]
        last_period = truncated.rfind('.')

        if last_period > max_chars * 0.7:  # Keep at least 70% of the content
            return truncated[:last_period + 1]
        else:
            return truncated + "..."

    def _build_prompt(self, query: str, context: str, history: str) -> str:
        """Build optimized prompt (shorter = faster)"""

        # ✅ Shorter system message
        system_msg = "You are an EWU admissions assistant. Answer based only on the context provided. Be concise."

        # ✅ Simpler format (fewer tokens)
        prompt = f"""<|system|>
{system_msg}</s>
<|user|>
Context: {context}

Question: {query}</s>
<|assistant|>
"""
        return prompt
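
    # Sketch (assumes transformers >= 4.34; not used by this commit): the same
    # Zephyr-style prompt can be built from the chat template that ships with
    # TinyLlama-1.1B-Chat, avoiding hand-maintained special tokens:
    #
    #   messages = [
    #       {"role": "system", "content": system_msg},
    #       {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"},
    #   ]
    #   prompt = self.tokenizer.apply_chat_template(
    #       messages, tokenize=False, add_generation_prompt=True
    #   )
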
    def _extract_answer(self, full_response: str, prompt: str) -> str:
        """Extract clean answer from response"""

        # Find assistant response
        if "<|assistant|>" in full_response:
            parts = full_response.split("<|assistant|>")
            answer = parts[-1] if len(parts) > 1 else full_response
        else:
            # Remove prompt
            answer = full_response.replace(prompt, "").strip()

        # Clean special tokens
        for token in ["</s>", "<|system|>", "<|user|>", "<|assistant|>", "<s>"]:
            answer = answer.replace(token, "")

        # Clean extra whitespace
        answer = " ".join(answer.split())

        # ✅ Limit length (avoid rambling)
        if len(answer) > 500:
            answer = answer[:500].rsplit('.', 1)[0] + "."

        return answer.strip() if answer.strip() else self._fallback_answer("")

    def _fallback_answer(self, context: str) -> str:
        """
        Fallback when generation fails
        Return formatted context instead
        """
        if not context:
            return "I apologize, but I couldn't find relevant information to answer your question."

        # Return first few lines of context
        lines = [line.strip() for line in context.split('\n') if line.strip()]
        return "\n".join(lines[:5]) + "\n\n📞 For more details: +880-2-9882308"


# ============================================================================
# EVEN FASTER: Ultra-lightweight alternative
# ============================================================================

class FastLLMGenerator:
    """
    Ultra-fast generator with DistilGPT-2 (10x smaller model)
    Use this if TinyLlama is still too slow
    """

    def __init__(self, model_name: str = "distilgpt2", device: str = "cpu"):
        print(f" ⚡ Loading {model_name} (ultra-fast)...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32
        ).to(device)

        self.model.eval()
        self.device = device

        print(f" ✅ Loaded! (82M params, 10x faster than TinyLlama)")

    def generate_answer(self, query: str, context: str, **kwargs) -> str:
        """Generate with ultra-fast model"""

        # Very simple prompt
        prompt = f"Context: {context[:800]}\n\nQ: {query}\nA:"

        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1000
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=80,  # Very short
                temperature=0.8,
                do_sample=True,
                top_p=0.9,
                pad_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = response.replace(prompt, "").strip()

        return answer if answer else "Based on the information provided."


# ============================================================================
# TEST
# ============================================================================

if __name__ == "__main__":
    print("="*70)
    print("Testing Optimized LLM Generators")
    print("="*70)

    test_context = """Program: B.Sc. in Computer Science Engineering (CSE)
Total Tuition Fee: 634,500 BDT
Total Credits: 141
Fee Per Credit: 4,500 BDT
Application Deadline: August 25, 2025
Admission Test Date: August 30, 2025"""

    test_query = "How much does the CSE program cost?"

    # Test 1: Optimized TinyLlama
    print("\n" + "="*70)
    print("TEST 1: Optimized TinyLlama")
    print("="*70)

    try:
        generator = LLMGenerator(
            model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            device="cpu",
            max_new_tokens=100  # Short responses
        )

        answer = generator.generate_answer(test_query, test_context)
        print(f"\n✅ Answer: {answer}\n")

    except Exception as e:
        print(f"❌ Error: {e}")

    # Test 2: Ultra-fast DistilGPT-2
    print("\n" + "="*70)
    print("TEST 2: Ultra-Fast DistilGPT-2")
    print("="*70)

    try:
        fast_gen = FastLLMGenerator(model_name="distilgpt2", device="cpu")

        answer = fast_gen.generate_answer(test_query, test_context)
        print(f"\n✅ Answer: {answer}\n")

    except Exception as e:
        print(f"❌ Error: {e}")

    print("="*70)
    print("✅ All tests completed!")
    print("="*70)