nivakaran committed on
Commit
d6ab240
·
verified ·
1 Parent(s): 4205d3d

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +0 -1
  2. .gitignore +3 -1
  3. Dockerfile +5 -13
  4. requirements.txt +3 -1
  5. src/config.py +2 -2
  6. src/llm/phi_model.py +61 -52
.gitattributes CHANGED
@@ -1 +0,0 @@
1
- *.gguf filter=lfs diff=lfs merge=lfs -text
 
 
.gitignore CHANGED
@@ -31,6 +31,8 @@ data/
31
  .DS_Store
32
  Thumbs.db
33
 
34
- # Large files (excluding models folder which we want to deploy)
 
 
35
  *.safetensors
36
  models/.cache/
 
31
  .DS_Store
32
  Thumbs.db
33
 
34
+ # Large files (not needed with Transformers - downloads automatically)
35
+ models/
36
+ *.gguf
37
  *.safetensors
38
  models/.cache/
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # HuggingFace Spaces Dockerfile for FreeRAG
2
- # Optimized for fast builds with pre-compiled wheels
3
 
4
- FROM python:3.11-slim
5
 
6
  # Set environment variables
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
@@ -9,16 +9,12 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
9
  PIP_NO_CACHE_DIR=1 \
10
  GRADIO_SERVER_NAME=0.0.0.0 \
11
  GRADIO_SERVER_PORT=7860 \
12
- HF_HOME=/home/user/.cache/huggingface
 
13
 
14
  # Create non-root user (required by HuggingFace Spaces)
15
  RUN useradd -m -u 1000 user
16
 
17
- # Install minimal system dependencies (no build tools needed)
18
- RUN apt-get update && apt-get install -y --no-install-recommends \
19
- curl \
20
- && rm -rf /var/lib/apt/lists/*
21
-
22
  USER user
23
  WORKDIR /home/user/app
24
 
@@ -28,12 +24,8 @@ RUN mkdir -p /home/user/.cache/huggingface
28
  # Copy requirements
29
  COPY --chown=user:user requirements.txt .
30
 
31
- # Install Python dependencies using pre-built wheels only
32
- # Use CPU-only llama-cpp-python wheel (no compilation needed!)
33
  RUN pip install --user --upgrade pip && \
34
- pip install --user \
35
- llama-cpp-python \
36
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu && \
37
  pip install --user -r requirements.txt
38
 
39
  # Copy application code
 
1
  # HuggingFace Spaces Dockerfile for FreeRAG
2
+ # Uses HuggingFace Transformers - NO compilation required
3
 
4
+ FROM python:3.10-slim
5
 
6
  # Set environment variables
7
  ENV PYTHONDONTWRITEBYTECODE=1 \
 
9
  PIP_NO_CACHE_DIR=1 \
10
  GRADIO_SERVER_NAME=0.0.0.0 \
11
  GRADIO_SERVER_PORT=7860 \
12
+ HF_HOME=/home/user/.cache/huggingface \
13
+ TRANSFORMERS_CACHE=/home/user/.cache/huggingface
14
 
15
  # Create non-root user (required by HuggingFace Spaces)
16
  RUN useradd -m -u 1000 user
17
 
 
 
 
 
 
18
  USER user
19
  WORKDIR /home/user/app
20
 
 
24
  # Copy requirements
25
  COPY --chown=user:user requirements.txt .
26
 
27
+ # Install Python dependencies (all pre-built wheels, no compilation!)
 
28
  RUN pip install --user --upgrade pip && \
 
 
 
29
  pip install --user -r requirements.txt
30
 
31
  # Copy application code
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
  # Core Dependencies
2
  huggingface_hub>=0.20.0
3
- llama-cpp-python>=0.2.50
 
 
4
 
5
  # Embeddings
6
  sentence-transformers>=2.2.2
 
1
  # Core Dependencies
2
  huggingface_hub>=0.20.0
3
+ transformers>=4.36.0
4
+ accelerate>=0.25.0
5
+ torch>=2.0.0
6
 
7
  # Embeddings
8
  sentence-transformers>=2.2.2
src/config.py CHANGED
@@ -7,8 +7,8 @@ from pathlib import Path
7
  @dataclass
8
  class ModelConfig:
9
  """LLM model configuration."""
10
- repo_id: str = "Qwen/Qwen2-0.5B-Instruct-GGUF"
11
- filename: str = "qwen2-0_5b-instruct-q4_k_m.gguf" # ~400MB - very fast startup
12
  n_ctx: int = 2048
13
  n_threads: int = 2
14
  max_tokens: int = 256
 
7
  @dataclass
8
  class ModelConfig:
9
  """LLM model configuration."""
10
+ # Using Qwen2-0.5B from HuggingFace (no GGUF format needed)
11
+ repo_id: str = "Qwen/Qwen2-0.5B-Instruct"
12
  n_ctx: int = 2048
13
  n_threads: int = 2
14
  max_tokens: int = 256
src/llm/phi_model.py CHANGED
@@ -1,11 +1,11 @@
1
- """Phi-3.5-mini model wrapper using llama-cpp-python."""
2
 
3
- from typing import Optional, List, Dict, Any
4
  import logging
5
  import sys
 
6
 
7
- from huggingface_hub import hf_hub_download
8
- from llama_cpp import Llama
9
 
10
  from src.config import ModelConfig
11
 
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
19
 
20
 
21
  class PhiModel:
22
- """Wrapper for Phi-3.5-mini model."""
23
 
24
  def __init__(self, config: Optional[ModelConfig] = None):
25
  """Initialize the model wrapper.
@@ -28,54 +28,53 @@ class PhiModel:
28
  config: Model configuration. Uses defaults if not provided.
29
  """
30
  self.config = config or ModelConfig()
31
- self._model: Optional[Llama] = None
32
- self._model_path: Optional[str] = None
 
33
 
34
  @property
35
- def model(self) -> Llama:
36
  """Lazy load the model."""
37
- if self._model is None:
38
  self._load_model()
39
- return self._model
40
 
41
  def _load_model(self) -> None:
42
  """Download and load the model with progress logging."""
43
- import os
44
-
45
- # Check for local model first
46
- local_model_path = os.path.join("models", self.config.filename)
47
-
48
- if os.path.exists(local_model_path):
49
 - logger.info(f"📂 Found local model: {local_model_path}")
50
- self._model_path = local_model_path
51
- else:
52
 - logger.info(f"📥 Downloading model: {self.config.filename}")
53
- logger.info(f" From: {self.config.repo_id}")
54
- logger.info(f" Size: ~400MB (Qwen2-0.5B)")
55
-
56
- try:
57
- self._model_path = hf_hub_download(
58
- repo_id=self.config.repo_id,
59
- filename=self.config.filename,
60
- resume_download=True,
61
- )
62
 - logger.info(f"✅ Model downloaded to: {self._model_path}")
63
- except Exception as e:
64
 - logger.error(f"❌ Model download failed: {e}")
65
- raise
66
-
67
 - logger.info("🔧 Loading model into memory...")
68
- logger.info(f" Context: {self.config.n_ctx} tokens")
69
- logger.info(f" Threads: {self.config.n_threads}")
70
 
71
  try:
72
- self._model = Llama(
73
- model_path=self._model_path,
74
- n_ctx=self.config.n_ctx,
75
- n_threads=self.config.n_threads,
76
- verbose=self.config.verbose
77
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  logger.info("✅ Model loaded successfully!")
 
79
  except Exception as e:
80
  logger.error(f"❌ Model loading failed: {e}")
81
  raise
@@ -90,13 +89,14 @@ class PhiModel:
90
  Returns:
91
  Generated text.
92
  """
93
- output = self.model(
94
  prompt,
95
- max_tokens=max_tokens or self.config.max_tokens,
96
  temperature=self.config.temperature,
97
- echo=False
 
98
  )
99
- return output["choices"][0]["text"].strip()
100
 
101
  def chat(
102
  self,
@@ -112,12 +112,21 @@ class PhiModel:
112
  Returns:
113
  Assistant's response.
114
  """
115
- output = self.model.create_chat_completion(
116
- messages=messages,
117
- max_tokens=max_tokens or self.config.max_tokens,
118
- temperature=self.config.temperature
119
- )
120
- return output["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
 
 
 
121
 
122
  def chat_with_context(
123
  self,
 
1
+ """LLM model wrapper using HuggingFace Transformers."""
2
 
 
3
  import logging
4
  import sys
5
+ from typing import Optional, List, Dict
6
 
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
9
 
10
  from src.config import ModelConfig
11
 
 
19
 
20
 
21
  class PhiModel:
22
+ """Wrapper for LLM model using HuggingFace Transformers."""
23
 
24
  def __init__(self, config: Optional[ModelConfig] = None):
25
  """Initialize the model wrapper.
 
28
  config: Model configuration. Uses defaults if not provided.
29
  """
30
  self.config = config or ModelConfig()
31
+ self._model = None
32
+ self._tokenizer = None
33
+ self._pipeline = None
34
 
35
  @property
36
+ def model(self):
37
  """Lazy load the model."""
38
+ if self._pipeline is None:
39
  self._load_model()
40
+ return self._pipeline
41
 
42
  def _load_model(self) -> None:
43
  """Download and load the model with progress logging."""
44
 + logger.info(f"📥 Loading model: {self.config.repo_id}")
45
+ logger.info(f" This may take a few minutes on first run...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  try:
48
+ # Load tokenizer
49
 + logger.info("🔧 Loading tokenizer...")
50
+ self._tokenizer = AutoTokenizer.from_pretrained(
51
+ self.config.repo_id,
52
+ trust_remote_code=True
53
  )
54
+
55
+ # Load model with CPU optimizations
56
 + logger.info("🔧 Loading model weights...")
57
+ self._model = AutoModelForCausalLM.from_pretrained(
58
+ self.config.repo_id,
59
+ torch_dtype=torch.float32,
60
+ device_map="cpu",
61
+ trust_remote_code=True,
62
+ low_cpu_mem_usage=True
63
+ )
64
+
65
+ # Create pipeline for text generation
66
+ self._pipeline = pipeline(
67
+ "text-generation",
68
+ model=self._model,
69
+ tokenizer=self._tokenizer,
70
+ max_new_tokens=self.config.max_tokens,
71
+ temperature=self.config.temperature,
72
+ do_sample=True,
73
+ pad_token_id=self._tokenizer.eos_token_id
74
+ )
75
+
76
  logger.info("✅ Model loaded successfully!")
77
+
78
  except Exception as e:
79
  logger.error(f"❌ Model loading failed: {e}")
80
  raise
 
89
  Returns:
90
  Generated text.
91
  """
92
+ result = self.model(
93
  prompt,
94
+ max_new_tokens=max_tokens or self.config.max_tokens,
95
  temperature=self.config.temperature,
96
+ do_sample=True,
97
+ return_full_text=False
98
  )
99
+ return result[0]["generated_text"].strip()
100
 
101
  def chat(
102
  self,
 
112
  Returns:
113
  Assistant's response.
114
  """
115
+ # Format messages for chat
116
+ chat_text = ""
117
+ for msg in messages:
118
+ role = msg["role"]
119
+ content = msg["content"]
120
+ if role == "system":
121
+ chat_text += f"System: {content}\n\n"
122
+ elif role == "user":
123
+ chat_text += f"User: {content}\n\n"
124
+ elif role == "assistant":
125
+ chat_text += f"Assistant: {content}\n\n"
126
+
127
+ chat_text += "Assistant: "
128
+
129
+ return self.generate(chat_text, max_tokens)
130
 
131
  def chat_with_context(
132
  self,