# FreeRag/src/llm/phi_model.py
"""Phi-3.5-mini model wrapper using llama-cpp-python."""
from typing import Optional, List, Dict, Any
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from src.config import ModelConfig
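
# NOTE: ModelConfig lives in src/config.py (not shown here). This wrapper assumes it
# exposes at least: repo_id, filename, n_ctx, n_threads, verbose, max_tokens, and
# temperature. A minimal sketch, with purely illustrative placeholder defaults:
#
#     from dataclasses import dataclass
#
#     @dataclass
#     class ModelConfig:
#         repo_id: str = "<huggingface-repo-with-a-phi-3.5-mini-gguf>"  # placeholder
#         filename: str = "<gguf-file-in-that-repo>"                    # placeholder
#         n_ctx: int = 4096          # context window passed to llama.cpp
#         n_threads: int = 8         # CPU threads used for inference
#         max_tokens: int = 512      # default generation cap
#         temperature: float = 0.7   # sampling temperature
#         verbose: bool = False      # llama.cpp logging
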
class PhiModel:
    """Wrapper for the Phi-3.5-mini model."""

    def __init__(self, config: Optional[ModelConfig] = None):
        """Initialize the model wrapper.

        Args:
            config: Model configuration. Uses defaults if not provided.
        """
        self.config = config or ModelConfig()
        self._model: Optional[Llama] = None
        self._model_path: Optional[str] = None

    @property
    def model(self) -> Llama:
        """Lazily download and load the model on first access."""
        if self._model is None:
            self._load_model()
        return self._model
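
    # _load_model below resolves the GGUF file via hf_hub_download, which caches it
    # in the local Hugging Face cache (HF_HOME, ~/.cache/huggingface by default), so
    # repeated runs reuse the already-downloaded file instead of fetching it again.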

    def _load_model(self) -> None:
        """Download and load the model."""
        print(f"Downloading model from {self.config.repo_id}...")
        self._model_path = hf_hub_download(
            repo_id=self.config.repo_id,
            filename=self.config.filename
        )

        print("Loading model into memory...")
        self._model = Llama(
            model_path=self._model_path,
            n_ctx=self.config.n_ctx,
            n_threads=self.config.n_threads,
            verbose=self.config.verbose
        )
        print("Model loaded successfully!")

    def generate(self, prompt: str, max_tokens: Optional[int] = None) -> str:
        """Generate a text completion.

        Args:
            prompt: Input prompt.
            max_tokens: Maximum tokens to generate. Defaults to the configured limit.

        Returns:
            Generated text.
        """
        output = self.model(
            prompt,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=self.config.temperature,
            echo=False
        )
        return output["choices"][0]["text"].strip()

    def chat(
        self,
        messages: List[Dict[str, str]],
        max_tokens: Optional[int] = None
    ) -> str:
        """Generate a chat completion.

        Args:
            messages: List of message dicts with 'role' and 'content' keys.
            max_tokens: Maximum tokens to generate. Defaults to the configured limit.

        Returns:
            The assistant's response text.
        """
        output = self.model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens or self.config.max_tokens,
            temperature=self.config.temperature
        )
        return output["choices"][0]["message"]["content"].strip()

    def chat_with_context(
        self,
        query: str,
        context: str,
        system_prompt: Optional[str] = None
    ) -> str:
        """Generate a response grounded in retrieved RAG context.

        Args:
            query: User's question.
            context: Retrieved context from documents.
            system_prompt: Optional system prompt overriding the default.

        Returns:
            Generated response.
        """
        if system_prompt is None:
            system_prompt = (
                "You are a helpful assistant. Answer the user's question based on "
                "the provided context. If the context doesn't contain relevant "
                "information, say so honestly. Be concise and accurate."
            )

        user_message = f"""Context:
{context}
Question: {query}
Please answer based on the context provided above."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        return self.chat(messages)
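

# Illustrative usage sketch: exercises the three public entry points of PhiModel.
# Assumes the default ModelConfig points at a downloadable GGUF on the Hugging Face
# Hub and that there is enough RAM to load it; the query/context strings are made up.
if __name__ == "__main__":
    llm = PhiModel()

    # Plain completion.
    print(llm.generate("Explain retrieval-augmented generation in one sentence."))

    # Chat-style completion.
    print(llm.chat([{"role": "user", "content": "What is a GGUF file?"}]))

    # RAG-style answer grounded in a retrieved snippet.
    snippet = "GGUF is the binary file format used by llama.cpp to store quantized models."
    print(llm.chat_with_context(query="What file format does llama.cpp use?", context=snippet))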