Upload folder using huggingface_hub
- .gitattributes +1 -35
- .gitignore +17 -0
- CONTRIBUTING.md +17 -0
- Dockerfile +31 -0
- LICENSE +21 -0
- README.md +118 -3
- app/__init__.py +0 -0
- app/ingest.py +94 -0
- app/main.py +99 -0
- app/rag.py +58 -0
- app/settings.py +24 -0
- requirements.txt +9 -0
- scripts/download_model.py +28 -0
- scripts/start.sh +13 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
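Note: the net effect of this change is that the repo's default LFS rules are replaced with a single rule tracking GGUF model weights. A minimal sketch of reproducing the same rule locally (assuming Git LFS is installed):

```bash
# Writes "*.gguf filter=lfs diff=lfs merge=lfs -text" into .gitattributes
git lfs install
git lfs track "*.gguf"
```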
.gitignore
ADDED
@@ -0,0 +1,17 @@
+# Python
+.venv/
+__pycache__/
+*.pyc
+
+# Local models / data
+models/
+data/index.faiss
+data/docstore.json
+
+# Azure local config
+.azure-config/
+
+# OS / editor
+.DS_Store
+.vscode/
+.idea/
CONTRIBUTING.md
ADDED
@@ -0,0 +1,17 @@
+# Contributing
+
+Thanks for considering a contribution.
+
+## Quick guidelines
+- Keep changes focused and minimal.
+- Run tests or a basic smoke test when possible.
+- Update docs if behavior changes.
+
+## Pull requests
+1. Fork and create a feature branch.
+2. Make changes with clear commit messages.
+3. Open a PR describing what/why and how to test.
+
+## Issues
+- Provide steps to reproduce.
+- Include logs or error traces when relevant.
Dockerfile
ADDED
@@ -0,0 +1,31 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build llama-cpp without -march=native to avoid illegal instruction on weaker CPUs
+ENV CMAKE_ARGS="-DLLAMA_NATIVE=OFF" \
+    FORCE_CMAKE=1
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app ./app
+COPY scripts ./scripts
+COPY data ./data
+
+RUN chmod +x /app/scripts/start.sh
+
+ENV MODEL_PATH="/models/Phi-3-mini-4k-instruct-q4.gguf" \
+    N_THREADS="4" \
+    N_GPU_LAYERS="0" \
+    N_CTX="4096" \
+    RAG_TOP_K="4" \
+    APP_PORT="8000"
+
+EXPOSE 8000
+
+CMD ["/bin/sh", "/app/scripts/start.sh"]
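Note: `llama-cpp-python` compiles llama.cpp from source at `pip install` time, so the `CMAKE_ARGS`/`FORCE_CMAKE` pair must be set before the `pip install` layer runs. A minimal sketch of the same portable build outside Docker (assuming pip triggers a source build rather than using a prebuilt wheel):

```bash
# Disable -march=native so the binary runs on CPUs other than the build host
CMAKE_ARGS="-DLLAMA_NATIVE=OFF" FORCE_CMAKE=1 \
  pip install --no-cache-dir --force-reinstall llama-cpp-python
```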
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Sekponakokou
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,118 @@
-
-
-
+# Quantized LLM + RAG (FastAPI + FAISS + Phi‑3)
+
+## Goal
+Deploy a small, low‑cost LLM with 4‑bit quantization + RAG, exposed via a clean FastAPI service that can run on CPU‑only servers (e.g., Azure Container Instances).
+
+FastAPI API serving a 4‑bit GGUF LLM with a lightweight FAISS RAG pipeline. Designed for low‑cost CPU servers (Azure Container Instances) and local Mac testing.
+
+## Features
+- 4‑bit quantized Phi‑3 GGUF (llama.cpp via `llama-cpp-python`)
+- Simple RAG with FAISS (cosine similarity)
+- Wikipedia public-source ingestion (replaceable)
+- Docker image ready for ACI
+
+## Repo structure
+```
+app/
+  main.py      # FastAPI app
+  rag.py       # FAISS utilities
+  ingest.py    # build index from public sources
+  settings.py  # config via env
+scripts/
+  download_model.py
+Dockerfile
+requirements.txt
+```
+
+## Local dev (Mac)
+
+```bash
+python3.12 -m venv .venv
+. .venv/bin/activate
+pip install -r requirements.txt
+
+# Download 4-bit Phi-3 GGUF
+python scripts/download_model.py \
+  --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+  --filename Phi-3-mini-4k-instruct-q4.gguf \
+  --out models
+
+# Build FAISS index from public pages
+python -m app.ingest --pages "Large_language_model,Azure,Quantization_(signal_processing)" --lang en
+
+# Run API
+export MODEL_PATH="models/Phi-3-mini-4k-instruct-q4.gguf"
+export N_GPU_LAYERS="-1"  # Metal offload on Mac
+uvicorn app.main:app --host 0.0.0.0 --port 8000
+```
+
+Test:
+```bash
+curl http://localhost:8000/health
+curl -X POST http://localhost:8000/chat \
+  -H "Content-Type: application/json" \
+  -d '{"question":"What is quantization in signal processing?"}'
+```
+
+## Docker (local)
+
+Build:
+```bash
+docker build -t quant-llm .
+```
+
+Run:
+```bash
+docker run --rm -p 8000:8000 \
+  -e MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf \
+  -v "$PWD/models:/models" \
+  quant-llm
+```
+
+## Azure Container Instances (ACI)
+
+1) Build + push to ACR:
+```bash
+az group create -n rg-quant-llm -l westeurope
+az acr create -n acrquantllm -g rg-quant-llm --sku Basic
+az acr login -n acrquantllm
+az acr build -t quant-llm:1 -r acrquantllm .
+```
+
+2) Run in ACI (downloads model at startup):
+```bash
+az container create \
+  -g rg-quant-llm \
+  -n quant-llm-api \
+  --image acrquantllm.azurecr.io/quant-llm:1 \
+  --registry-login-server acrquantllm.azurecr.io \
+  --registry-username <ACR_USERNAME> \
+  --registry-password <ACR_PASSWORD> \
+  --cpu 2 --memory 6 \
+  --ports 8000 \
+  --environment-variables MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf N_THREADS=2 N_GPU_LAYERS=0 \
+  --command-line "bash -lc 'python scripts/download_model.py --repo microsoft/Phi-3-mini-4k-instruct-gguf --filename Phi-3-mini-4k-instruct-q4.gguf --out /models && uvicorn app.main:app --host 0.0.0.0 --port 8000'"
+```
+
+3) Get public IP:
+```bash
+az container show -g rg-quant-llm -n quant-llm-api --query ipAddress.ip -o tsv
+```
+
+## Config
+Environment variables in `app/settings.py`:
+- `MODEL_PATH` (default: `models/phi-3-mini-4k-instruct-q4.gguf`)
+- `N_CTX` (default: 4096)
+- `N_THREADS` (default: 8)
+- `N_GPU_LAYERS` (default: 0, use `-1` on Mac for Metal)
+- `RAG_TOP_K` (default: 4)
+
+## Notes
+- 4‑bit GGUF is the best CPU-friendly option for cost/memory.
+- RAG sources are currently Wikipedia; swap `app/ingest.py` to your own docs.
+
+## Contributing
+See `CONTRIBUTING.md`.
+
+## License
+MIT. See `LICENSE`.
app/__init__.py
ADDED
File without changes
app/ingest.py
ADDED
@@ -0,0 +1,94 @@
+import argparse
+import re
+from typing import List, Dict
+
+import requests
+from sentence_transformers import SentenceTransformer
+
+from .rag import embed_texts, build_faiss_index, save_faiss_index, save_docstore
+from .settings import settings
+
+
+def fetch_wikipedia_page(title: str, lang: str = "en") -> str:
+    url = f"https://{lang}.wikipedia.org/w/api.php"
+    headers = {"User-Agent": "quantized-rag/0.1 (local test; contact: dev@example.com)"}
+    params = {
+        "action": "query",
+        "prop": "extracts",
+        "explaintext": 1,
+        "titles": title,
+        "format": "json",
+    }
+    resp = requests.get(url, headers=headers, params=params, timeout=30)
+    resp.raise_for_status()
+    data = resp.json()
+    pages = data.get("query", {}).get("pages", {})
+    if not pages:
+        return ""
+    page = next(iter(pages.values()))
+    return page.get("extract", "")
+
+
+def chunk_text(text: str, chunk_size: int = 350, overlap: int = 40) -> List[str]:
+    words = re.findall(r"\S+", text)
+    chunks = []
+    start = 0
+    while start < len(words):
+        end = min(len(words), start + chunk_size)
+        chunk = " ".join(words[start:end])
+        chunks.append(chunk)
+        if end == len(words):
+            break
+        start = end - overlap
+        if start < 0:
+            start = 0
+    return chunks
+
+
+def build_docs(titles: List[str], lang: str = "en") -> List[Dict]:
+    docs: List[Dict] = []
+    for title in titles:
+        text = fetch_wikipedia_page(title, lang=lang)
+        for i, chunk in enumerate(chunk_text(text)):
+            docs.append(
+                {
+                    "id": f"{title}:{i}",
+                    "title": title,
+                    "source": f"https://{lang}.wikipedia.org/wiki/{title}",
+                    "text": chunk,
+                }
+            )
+    return docs
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Build FAISS index from Wikipedia pages")
+    parser.add_argument(
+        "--pages",
+        required=True,
+        help="Comma-separated list of Wikipedia page titles, e.g. 'Azure,Large_language_model'",
+    )
+    parser.add_argument("--lang", default="en", help="Wikipedia language (default: en)")
+    parser.add_argument("--out-index", default=settings.faiss_index_path)
+    parser.add_argument("--out-docs", default=settings.docstore_path)
+    args = parser.parse_args()
+
+    titles = [p.strip().replace(" ", "_") for p in args.pages.split(",") if p.strip()]
+    if not titles:
+        raise SystemExit("No pages provided")
+
+    docs = build_docs(titles, lang=args.lang)
+    embedder = SentenceTransformer(settings.embed_model)
+    embeddings = embed_texts(embedder, [d["text"] for d in docs])
+
+    index = build_faiss_index(embeddings)
+    save_faiss_index(args.out_index, index)
+    save_docstore(args.out_docs, docs)
+
+    print(f"Saved {len(docs)} chunks")
+    print(f"Index: {args.out_index}")
+    print(f"Docstore: {args.out_docs}")
+
+
+if __name__ == "__main__":
+    main()
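Note: `chunk_text` slides a fixed window of `chunk_size` words forward by `chunk_size - overlap` each step, so consecutive chunks share a 40-word seam and a fact straddling a boundary still lands intact in one chunk. An illustrative run (dummy input; expected values worked out from the defaults above):

```python
from app.ingest import chunk_text

text = " ".join(f"w{i}" for i in range(800))  # 800 dummy "words"

chunks = chunk_text(text)         # chunk_size=350, overlap=40
print(len(chunks))                # 3 -> windows 0-349, 310-659, 620-799
print(chunks[0].split()[-1])      # w349: end of the first window
print(chunks[1].split()[0])       # w310: restarts 40 words earlier
```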
app/main.py
ADDED
@@ -0,0 +1,99 @@
+from pathlib import Path
+from typing import List, Optional, Dict
+
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from llama_cpp import Llama
+from sentence_transformers import SentenceTransformer
+
+from .settings import settings
+from .rag import load_docstore, load_faiss_index, retrieve
+
+
+class ChatRequest(BaseModel):
+    question: str
+    history: Optional[List[Dict[str, str]]] = None
+
+
+class ChatResponse(BaseModel):
+    answer: str
+    sources: List[Dict[str, str]]
+
+
+app = FastAPI(title="Quantized LLM + RAG")
+
+llm: Optional[Llama] = None
+embedder: Optional[SentenceTransformer] = None
+rag_index = None
+rag_docs = None
+
+
+@app.on_event("startup")
+def load_resources() -> None:
+    global llm, embedder, rag_index, rag_docs
+
+    model_path = Path(settings.model_path)
+    if not model_path.exists():
+        raise RuntimeError(
+            f"Model not found at {model_path}. Set MODEL_PATH env var or download a GGUF model."
+        )
+
+    llm = Llama(
+        model_path=str(model_path),
+        n_ctx=settings.n_ctx,
+        n_threads=settings.n_threads,
+        n_gpu_layers=settings.n_gpu_layers,
+    )
+
+    embedder = SentenceTransformer(settings.embed_model)
+
+    index_path = Path(settings.faiss_index_path)
+    docs_path = Path(settings.docstore_path)
+    if index_path.exists() and docs_path.exists():
+        rag_index = load_faiss_index(str(index_path))
+        rag_docs = load_docstore(str(docs_path))
+    else:
+        rag_index = None
+        rag_docs = None
+
+
+@app.get("/health")
+def health() -> Dict[str, str]:
+    return {"status": "ok"}
+
+
+@app.post("/chat", response_model=ChatResponse)
+def chat(req: ChatRequest) -> ChatResponse:
+    if llm is None or embedder is None:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    if not req.question.strip():
+        raise HTTPException(status_code=400, detail="Question is required")
+
+    context_blocks = []
+    sources: List[Dict[str, str]] = []
+    if rag_index is not None and rag_docs is not None:
+        results = retrieve(req.question, embedder, rag_index, rag_docs, settings.rag_top_k)
+        for doc, score in results:
+            context_blocks.append(f"[Source] {doc['text']}")
+            sources.append({"title": doc.get("title", ""), "source": doc.get("source", "")})
+
+    system_prompt = (
+        "You are a helpful assistant. Use the provided context to answer. "
+        "If the answer is not in the context, say you do not know."
+    )
+    context = "\n\n".join(context_blocks) if context_blocks else ""
+
+    prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{req.question}\n"
+    if context:
+        prompt += f"<|context|>\n{context}\n"
+    prompt += "<|assistant|>\n"
+
+    output = llm(
+        prompt,
+        temperature=settings.temperature,
+        max_tokens=settings.max_tokens,
+        stop=["<|user|>", "<|assistant|>", "<|system|>", "</s>"],
+    )
+
+    answer = output["choices"][0]["text"].strip()
+    return ChatResponse(answer=answer, sources=sources)
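Note: `/chat` takes a JSON body with a required `question` (plus an optional `history` that the handler currently ignores) and returns the generated `answer` together with the retrieved `sources`. A hedged client sketch, assuming the API is running locally on port 8000:

```python
import requests

resp = requests.post(
    "http://localhost:8000/chat",
    json={"question": "What is quantization in signal processing?"},
    timeout=120,  # CPU generation can be slow
)
resp.raise_for_status()
body = resp.json()

print(body["answer"])         # model output
for src in body["sources"]:   # title/URL of each retrieved chunk
    print(src["title"], src["source"])
```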
app/rag.py
ADDED
@@ -0,0 +1,58 @@
+import json
+from typing import List, Dict, Tuple
+
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+
+def _normalize(vecs: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12
+    return vecs / norms
+
+
+def embed_texts(model: SentenceTransformer, texts: List[str]) -> np.ndarray:
+    embeddings = model.encode(texts, batch_size=32, show_progress_bar=False)
+    embeddings = np.array(embeddings, dtype=np.float32)
+    return _normalize(embeddings)
+
+
+def save_docstore(path: str, docs: List[Dict]) -> None:
+    with open(path, "w", encoding="utf-8") as f:
+        json.dump(docs, f, ensure_ascii=False, indent=2)
+
+
+def load_docstore(path: str) -> List[Dict]:
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
+    index = faiss.IndexFlatIP(embeddings.shape[1])
+    index.add(embeddings)
+    return index
+
+
+def save_faiss_index(path: str, index: faiss.IndexFlatIP) -> None:
+    faiss.write_index(index, path)
+
+
+def load_faiss_index(path: str) -> faiss.IndexFlatIP:
+    return faiss.read_index(path)
+
+
+def retrieve(
+    query: str,
+    model: SentenceTransformer,
+    index: faiss.IndexFlatIP,
+    docs: List[Dict],
+    top_k: int = 4,
+) -> List[Tuple[Dict, float]]:
+    query_vec = embed_texts(model, [query])
+    scores, indices = index.search(query_vec, top_k)
+    results: List[Tuple[Dict, float]] = []
+    for idx, score in zip(indices[0], scores[0]):
+        if idx == -1:
+            continue
+        results.append((docs[int(idx)], float(score)))
+    return results
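Note: the "cosine similarity" mentioned in the README comes from the combination of `_normalize` and `IndexFlatIP`: once rows are L2-normalized, the inner product the index computes equals the cosine of the angle between vectors. A small self-contained check (random data, not the real embedder):

```python
import faiss
import numpy as np

vecs = np.random.rand(4, 8).astype(np.float32)
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-length rows

index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)

scores, ids = index.search(vecs[:1], k=2)
print(ids[0][0])     # 0: the nearest neighbor of vector 0 is itself
print(scores[0][0])  # ~1.0: cos(v, v) for a unit vector
```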
app/settings.py
ADDED
@@ -0,0 +1,24 @@
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    # Model
+    model_path: str = "models/phi-3-mini-4k-instruct-q4.gguf"
+    n_ctx: int = 4096
+    n_threads: int = 8
+    n_gpu_layers: int = 0  # set -1 on Mac Metal to offload all layers
+    temperature: float = 0.2
+    max_tokens: int = 512
+
+    # RAG
+    embed_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+    faiss_index_path: str = "data/index.faiss"
+    docstore_path: str = "data/docstore.json"
+    rag_top_k: int = 4
+
+    # API
+    app_host: str = "0.0.0.0"
+    app_port: int = 8000
+
+
+settings = Settings()
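Note: because `Settings` extends pydantic's `BaseSettings`, each field can be overridden by an environment variable of the same name (matched case-insensitively), which is how the `ENV` lines in the Dockerfile reach the app. A quick sketch:

```python
import os

# MODEL_PATH / N_GPU_LAYERS map onto model_path / n_gpu_layers
os.environ["MODEL_PATH"] = "/models/custom.gguf"
os.environ["N_GPU_LAYERS"] = "-1"

from app.settings import Settings

s = Settings()  # env is read when the instance is created
print(s.model_path)    # /models/custom.gguf
print(s.n_gpu_layers)  # -1
```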
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+fastapi
+uvicorn[standard]
+pydantic-settings
+llama-cpp-python
+sentence-transformers
+faiss-cpu
+numpy
+requests
+huggingface_hub
scripts/download_model.py
ADDED
@@ -0,0 +1,28 @@
+import argparse
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Download GGUF model from Hugging Face")
+    parser.add_argument("--repo", required=True, help="HF repo id, e.g. microsoft/Phi-3-mini-4k-instruct-gguf")
+    parser.add_argument("--filename", required=True, help="GGUF filename, e.g. Phi-3-mini-4k-instruct-q4.gguf")
+    parser.add_argument("--out", default="models", help="Output directory")
+    args = parser.parse_args()
+
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    file_path = hf_hub_download(
+        repo_id=args.repo,
+        filename=args.filename,
+        local_dir=str(out_dir),
+        local_dir_use_symlinks=False,
+    )
+
+    print(f"Downloaded to {file_path}")
+
+
+if __name__ == "__main__":
+    main()
scripts/start.sh
ADDED
@@ -0,0 +1,13 @@
+#!/bin/sh
+set -e
+
+MODEL_PATH=${MODEL_PATH:-/models/Phi-3-mini-4k-instruct-q4.gguf}
+
+if [ ! -f "$MODEL_PATH" ]; then
+  python scripts/download_model.py \
+    --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+    --filename Phi-3-mini-4k-instruct-q4.gguf \
+    --out /models
+fi
+
+exec uvicorn app.main:app --host 0.0.0.0 --port 8000