nivakaran committed on
Commit
d6ab240
·
verified ·
1 Parent(s): 4205d3d

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +0 -1
  2. .gitignore +3 -1
  3. Dockerfile +5 -13
  4. requirements.txt +3 -1
  5. src/config.py +2 -2
  6. src/llm/phi_model.py +61 -52
.gitattributes CHANGED
@@ -1 +0,0 @@
1
- *.gguf filter=lfs diff=lfs merge=lfs -text
 
 
.gitignore CHANGED
@@ -31,6 +31,8 @@ data/
31
  .DS_Store
32
  Thumbs.db
33
 
34
- # Large files (excluding models folder which we want to deploy)
 
 
35
  *.safetensors
36
  models/.cache/
 
31
  .DS_Store
32
  Thumbs.db
33
 
34
+ # Large files (not needed with Transformers - downloads automatically)
35
+ models/
36
+ *.gguf
37
  *.safetensors
38
  models/.cache/
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # HuggingFace Spaces Dockerfile for FreeRAG
2
- # Optimized for fast builds with pre-compiled wheels
3
 
4
- FROM python:3.11-slim
5
 
6
  # Set environment variables
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -9,16 +9,12 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
9
  PIP_NO_CACHE_DIR=1 \
10
  GRADIO_SERVER_NAME=0.0.0.0 \
11
  GRADIO_SERVER_PORT=7860 \
12
- HF_HOME=/home/user/.cache/huggingface
 
13
 
14
  # Create non-root user (required by HuggingFace Spaces)
15
  RUN useradd -m -u 1000 user
16
 
17
- # Install minimal system dependencies (no build tools needed)
18
- RUN apt-get update && apt-get install -y --no-install-recommends \
19
- curl \
20
- && rm -rf /var/lib/apt/lists/*
21
-
22
  USER user
23
  WORKDIR /home/user/app
24
 
@@ -28,12 +24,8 @@ RUN mkdir -p /home/user/.cache/huggingface
28
  # Copy requirements
29
  COPY --chown=user:user requirements.txt .
30
 
31
- # Install Python dependencies using pre-built wheels only
32
- # Use CPU-only llama-cpp-python wheel (no compilation needed!)
33
  RUN pip install --user --upgrade pip && \
34
- pip install --user \
35
- llama-cpp-python \
36
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu && \
37
  pip install --user -r requirements.txt
38
 
39
  # Copy application code
 
1
  # HuggingFace Spaces Dockerfile for FreeRAG
2
+ # Uses HuggingFace Transformers - NO compilation required
3
 
4
+ FROM python:3.10-slim
5
 
6
  # Set environment variables
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
 
9
  PIP_NO_CACHE_DIR=1 \
10
  GRADIO_SERVER_NAME=0.0.0.0 \
11
  GRADIO_SERVER_PORT=7860 \
12
+ HF_HOME=/home/user/.cache/huggingface \
13
+ TRANSFORMERS_CACHE=/home/user/.cache/huggingface
14
 
15
  # Create non-root user (required by HuggingFace Spaces)
16
  RUN useradd -m -u 1000 user
17
 
 
 
 
 
 
18
  USER user
19
  WORKDIR /home/user/app
20
 
 
24
  # Copy requirements
25
  COPY --chown=user:user requirements.txt .
26
 
27
+ # Install Python dependencies (all pre-built wheels, no compilation!)
 
28
  RUN pip install --user --upgrade pip && \
 
 
 
29
  pip install --user -r requirements.txt
30
 
31
  # Copy application code
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
  # Core Dependencies
2
  huggingface_hub>=0.20.0
3
- llama-cpp-python>=0.2.50
 
 
4
 
5
  # Embeddings
6
  sentence-transformers>=2.2.2
 
1
  # Core Dependencies
2
  huggingface_hub>=0.20.0
3
+ transformers>=4.36.0
4
+ accelerate>=0.25.0
5
+ torch>=2.0.0
6
 
7
  # Embeddings
8
  sentence-transformers>=2.2.2
src/config.py CHANGED
@@ -7,8 +7,8 @@ from pathlib import Path
7
  @dataclass
8
  class ModelConfig:
9
  """LLM model configuration."""
10
- repo_id: str = "Qwen/Qwen2-0.5B-Instruct-GGUF"
11
- filename: str = "qwen2-0_5b-instruct-q4_k_m.gguf" # ~400MB - very fast startup
12
  n_ctx: int = 2048
13
  n_threads: int = 2
14
  max_tokens: int = 256
 
7
  @dataclass
8
  class ModelConfig:
9
  """LLM model configuration."""
10
+ # Using Qwen2-0.5B from HuggingFace (no GGUF format needed)
11
+ repo_id: str = "Qwen/Qwen2-0.5B-Instruct"
12
  n_ctx: int = 2048
13
  n_threads: int = 2
14
  max_tokens: int = 256
src/llm/phi_model.py CHANGED
@@ -1,11 +1,11 @@
1
- """Phi-3.5-mini model wrapper using llama-cpp-python."""
2
 
3
- from typing import Optional, List, Dict, Any
4
  import logging
5
  import sys
 
6
 
7
- from huggingface_hub import hf_hub_download
8
- from llama_cpp import Llama
9
 
10
  from src.config import ModelConfig
11
 
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
19
 
20
 
21
  class PhiModel:
22
- """Wrapper for Phi-3.5-mini model."""
23
 
24
  def __init__(self, config: Optional[ModelConfig] = None):
25
  """Initialize the model wrapper.
@@ -28,54 +28,53 @@ class PhiModel:
28
  config: Model configuration. Uses defaults if not provided.
29
  """
30
  self.config = config or ModelConfig()
31
- self._model: Optional[Llama] = None
32
- self._model_path: Optional[str] = None
 
33
 
34
  @property
35
- def model(self) -> Llama:
36
  """Lazy load the model."""
37
- if self._model is None:
38
  self._load_model()
39
- return self._model
40
 
41
  def _load_model(self) -> None:
42
  """Download and load the model with progress logging."""
43
- import os
44
-
45
- # Check for local model first
46
- local_model_path = os.path.join("models", self.config.filename)
47
-
48
- if os.path.exists(local_model_path):
49
 - logger.info(f"📂 Found local model: {local_model_path}")
50
- self._model_path = local_model_path
51
- else:
52
 - logger.info(f"📥 Downloading model: {self.config.filename}")
53
- logger.info(f" From: {self.config.repo_id}")
54
- logger.info(f" Size: ~400MB (Qwen2-0.5B)")
55
-
56
- try:
57
- self._model_path = hf_hub_download(
58
- repo_id=self.config.repo_id,
59
- filename=self.config.filename,
60
- resume_download=True,
61
- )
62
 - logger.info(f"✅ Model downloaded to: {self._model_path}")
63
- except Exception as e:
64
 - logger.error(f"❌ Model download failed: {e}")
65
- raise
66
-
67
 - logger.info("🔧 Loading model into memory...")
68
- logger.info(f" Context: {self.config.n_ctx} tokens")
69
- logger.info(f" Threads: {self.config.n_threads}")
70
 
71
  try:
72
- self._model = Llama(
73
- model_path=self._model_path,
74
- n_ctx=self.config.n_ctx,
75
- n_threads=self.config.n_threads,
76
- verbose=self.config.verbose
77
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  logger.info("✅ Model loaded successfully!")
 
79
  except Exception as e:
80
  logger.error(f"❌ Model loading failed: {e}")
81
  raise
@@ -90,13 +89,14 @@ class PhiModel:
90
  Returns:
91
  Generated text.
92
  """
93
- output = self.model(
94
  prompt,
95
- max_tokens=max_tokens or self.config.max_tokens,
96
  temperature=self.config.temperature,
97
- echo=False
 
98
  )
99
- return output["choices"][0]["text"].strip()
100
 
101
  def chat(
102
  self,
@@ -112,12 +112,21 @@ class PhiModel:
112
  Returns:
113
  Assistant's response.
114
  """
115
- output = self.model.create_chat_completion(
116
- messages=messages,
117
- max_tokens=max_tokens or self.config.max_tokens,
118
- temperature=self.config.temperature
119
- )
120
- return output["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
 
 
 
121
 
122
  def chat_with_context(
123
  self,
 
1
+ """LLM model wrapper using HuggingFace Transformers."""
2
 
 
3
  import logging
4
  import sys
5
+ from typing import Optional, List, Dict
6
 
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
9
 
10
  from src.config import ModelConfig
11
 
 
19
 
20
 
21
  class PhiModel:
22
+ """Wrapper for LLM model using HuggingFace Transformers."""
23
 
24
  def __init__(self, config: Optional[ModelConfig] = None):
25
  """Initialize the model wrapper.
 
28
  config: Model configuration. Uses defaults if not provided.
29
  """
30
  self.config = config or ModelConfig()
31
+ self._model = None
32
+ self._tokenizer = None
33
+ self._pipeline = None
34
 
35
  @property
36
+ def model(self):
37
  """Lazy load the model."""
38
+ if self._pipeline is None:
39
  self._load_model()
40
+ return self._pipeline
41
 
42
  def _load_model(self) -> None:
43
  """Download and load the model with progress logging."""
44
 + logger.info(f"📥 Loading model: {self.config.repo_id}")
45
+ logger.info(f" This may take a few minutes on first run...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  try:
48
+ # Load tokenizer
49
 + logger.info("🔧 Loading tokenizer...")
50
+ self._tokenizer = AutoTokenizer.from_pretrained(
51
+ self.config.repo_id,
52
+ trust_remote_code=True
53
  )
54
+
55
+ # Load model with CPU optimizations
56
 + logger.info("🔧 Loading model weights...")
57
+ self._model = AutoModelForCausalLM.from_pretrained(
58
+ self.config.repo_id,
59
+ torch_dtype=torch.float32,
60
+ device_map="cpu",
61
+ trust_remote_code=True,
62
+ low_cpu_mem_usage=True
63
+ )
64
+
65
+ # Create pipeline for text generation
66
+ self._pipeline = pipeline(
67
+ "text-generation",
68
+ model=self._model,
69
+ tokenizer=self._tokenizer,
70
+ max_new_tokens=self.config.max_tokens,
71
+ temperature=self.config.temperature,
72
+ do_sample=True,
73
+ pad_token_id=self._tokenizer.eos_token_id
74
+ )
75
+
76
  logger.info("✅ Model loaded successfully!")
77
+
78
  except Exception as e:
79
  logger.error(f"❌ Model loading failed: {e}")
80
  raise
 
89
  Returns:
90
  Generated text.
91
  """
92
+ result = self.model(
93
  prompt,
94
+ max_new_tokens=max_tokens or self.config.max_tokens,
95
  temperature=self.config.temperature,
96
+ do_sample=True,
97
+ return_full_text=False
98
  )
99
+ return result[0]["generated_text"].strip()
100
 
101
  def chat(
102
  self,
 
112
  Returns:
113
  Assistant's response.
114
  """
115
+ # Format messages for chat
116
+ chat_text = ""
117
+ for msg in messages:
118
+ role = msg["role"]
119
+ content = msg["content"]
120
+ if role == "system":
121
+ chat_text += f"System: {content}\n\n"
122
+ elif role == "user":
123
+ chat_text += f"User: {content}\n\n"
124
+ elif role == "assistant":
125
+ chat_text += f"Assistant: {content}\n\n"
126
+
127
+ chat_text += "Assistant: "
128
+
129
+ return self.generate(chat_text, max_tokens)
130
 
131
  def chat_with_context(
132
  self,