from pathlib import Path
from typing import List, Optional
import yaml
import numpy as np
# Optional heavy dependencies: fall back to None so the app can still start
# (and print warnings) when they are not installed.
try:
    from llama_cpp import Llama
except Exception:
    Llama = None

try:
    from sentence_transformers import SentenceTransformer
except Exception:
    SentenceTransformer = None

# Load config
CFG_PATH = Path(__file__).resolve().parent.parent.parent / "configs" / "default.yaml"
if CFG_PATH.exists():
    with open(CFG_PATH, "r") as f:
        _CFG = yaml.safe_load(f)
else:
    # Fallback defaults used when configs/default.yaml is absent.
    _CFG = {
        "model_path": "models/qwen2.5-0.5b-instruct-q4_0.gguf",
        "embed_model": "sentence-transformers/all-MiniLM-L6-v2",
        "faiss_dir": "faiss_index",
        "chunk_size": 1000,
        "chunk_overlap": 200,
    }
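
# For reference, configs/default.yaml is expected to mirror the fallback dict
# above. Illustrative sketch only (keys and values taken from the defaults in
# this file, not from the actual config shipped with the Space):
#
#   model_path: models/qwen2.5-0.5b-instruct-q4_0.gguf
#   embed_model: sentence-transformers/all-MiniLM-L6-v2
#   faiss_dir: faiss_index
#   chunk_size: 1000
#   chunk_overlap: 200
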
class ModelLoader:
    def __init__(
        self,
        model_path: Optional[str] = None,
        embed_model_name: Optional[str] = None,
        faiss_dir: Optional[str] = None,
        n_ctx: int = 2048,  # 0.5B models cannot handle a 4k context well
    ):
        self.model_path = Path(model_path or _CFG.get("model_path"))
        self.embed_model_name = embed_model_name or _CFG.get("embed_model")
        self.faiss_dir = Path(faiss_dir or _CFG.get("faiss_dir"))
        self.n_ctx = n_ctx

        # Handles populated by _load_all(); each stays None when the
        # corresponding dependency or model file is missing.
        self.llm = None
        self.embedder = None
        self.index = None
        self.documents: List[str] = []

        self._load_all()

    def _load_llm(self):
        if not self.model_path.exists():
            print(f"[WARN] LLM model not found: {self.model_path}")
            return None
        if Llama is None:
            print("[WARN] llama-cpp-python missing.")
            return None
        print(f"[INFO] Loading local LLM: {self.model_path}")
        return Llama(
            model_path=str(self.model_path),
            n_ctx=self.n_ctx,
            n_threads=4,
            n_gpu_layers=0,  # CPU-only inference
        )

    def _load_embedder(self):
        if SentenceTransformer is None:
            print("[WARN] sentence-transformers missing.")
            return None
        print(f"[INFO] Loading embedder: {self.embed_model_name}")
        return SentenceTransformer(self.embed_model_name)

    def _load_all(self):
        self.llm = self._load_llm()
        self.embedder = self._load_embedder()
        # The FAISS index is not built or loaded here; it stays None until
        # one is attached.
        self.index = None

    def embed(self, texts: List[str]):
        if self.embedder is None:
            raise RuntimeError("Embedder is missing.")
        return self.embedder.encode(texts, show_progress_bar=False)

    def chat(self, prompt: str, max_tokens: int = 256) -> str:
        if not self.llm:
            return "[Local LLM missing - place a .gguf model inside models/]"
        out = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            echo=False,
        )
        # llama-cpp-python returns a completion dict; fall back to its raw
        # string form if the expected shape is not there.
        try:
            return out["choices"][0]["text"].strip()
        except Exception:
            return str(out)

    def answer_from_rag(self, query: str, max_tokens: int = 256) -> str:
        # Thin wrapper: retrieval is expected to happen upstream, so this just
        # forwards the (already augmented) prompt to chat().
        return self.chat(query, max_tokens=max_tokens)
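

# Illustrative usage sketch (not part of the original module): a minimal smoke
# test showing how ModelLoader is meant to be driven. The query text below is
# an assumption for demonstration only; the real app wires this class into its
# own RAG pipeline.
if __name__ == "__main__":
    # Falls back to configs/default.yaml or the built-in defaults above.
    loader = ModelLoader()

    # Embedding only works when sentence-transformers is installed.
    if loader.embedder is not None:
        vectors = loader.embed(["hello world", "local RAG demo"])
        print(f"[INFO] Embedded {len(vectors)} texts, dim={len(vectors[0])}")

    # chat() degrades to a placeholder string when no .gguf model is present.
    print(loader.chat("Say hello in one short sentence.", max_tokens=32))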