sitsope committed on
Commit
a32bf9d
·
verified ·
1 Parent(s): d6c9f5a

Upload folder using huggingface_hub

Files changed (14)
  1. .gitattributes +1 -35
  2. .gitignore +17 -0
  3. CONTRIBUTING.md +17 -0
  4. Dockerfile +31 -0
  5. LICENSE +21 -0
  6. README.md +118 -3
  7. app/__init__.py +0 -0
  8. app/ingest.py +94 -0
  9. app/main.py +99 -0
  10. app/rag.py +58 -0
  11. app/settings.py +24 -0
  12. requirements.txt +9 -0
  13. scripts/download_model.py +28 -0
  14. scripts/start.sh +13 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+ # Python
+ .venv/
+ __pycache__/
+ *.pyc
+
+ # Local models / data
+ models/
+ data/index.faiss
+ data/docstore.json
+
+ # Azure local config
+ .azure-config/
+
+ # OS / editor
+ .DS_Store
+ .vscode/
+ .idea/
CONTRIBUTING.md ADDED
@@ -0,0 +1,17 @@
+ # Contributing
+
+ Thanks for considering a contribution.
+
+ ## Quick guidelines
+ - Keep changes focused and minimal.
+ - Run tests or a basic smoke test when possible.
+ - Update docs if behavior changes.
+
+ ## Pull requests
+ 1. Fork and create a feature branch.
+ 2. Make changes with clear commit messages.
+ 3. Open a PR describing what/why and how to test.
+
+ ## Issues
+ - Provide steps to reproduce.
+ - Include logs or error traces when relevant.
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Build llama-cpp without -march=native to avoid illegal instruction on weaker CPUs
+ ENV CMAKE_ARGS="-DLLAMA_NATIVE=OFF" \
+     FORCE_CMAKE=1
+
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app ./app
+ COPY scripts ./scripts
+ COPY data ./data
+
+ RUN chmod +x /app/scripts/start.sh
+
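+ # Runtime defaults; override with `docker run -e` or ACI --environment-variables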
+ ENV MODEL_PATH="/models/Phi-3-mini-4k-instruct-q4.gguf" \
+     N_THREADS="4" \
+     N_GPU_LAYERS="0" \
+     N_CTX="4096" \
+     RAG_TOP_K="4" \
+     APP_PORT="8000"
+
+ EXPOSE 8000
+
+ CMD ["/bin/sh", "/app/scripts/start.sh"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Sekponakokou
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,118 @@
- ---
- license: mit
- ---
+ # Quantized LLM + RAG (FastAPI + FAISS + Phi‑3)
+
+ ## Goal
+ Deploy a small, low‑cost LLM with 4‑bit quantization + RAG: a clean FastAPI service serving a 4‑bit GGUF model through a lightweight FAISS pipeline, built to run on CPU‑only servers (e.g., Azure Container Instances) and to be tested locally on a Mac.
+
+ ## Features
+ - 4‑bit quantized Phi‑3 GGUF (llama.cpp via `llama-cpp-python`)
+ - Simple RAG with FAISS (cosine similarity)
+ - Wikipedia public-source ingestion (replaceable)
+ - Docker image ready for ACI
+
+ ## Repo structure
+ ```
+ app/
+   main.py      # FastAPI app
+   rag.py       # FAISS utilities
+   ingest.py    # build index from public sources
+   settings.py  # config via env
+ scripts/
+   download_model.py
+ Dockerfile
+ requirements.txt
+ ```
+
+ ## Local dev (Mac)
+
+ ```bash
+ python3.12 -m venv .venv
+ . .venv/bin/activate
+ pip install -r requirements.txt
+
+ # Download 4-bit Phi-3 GGUF
+ python scripts/download_model.py \
+   --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+   --filename Phi-3-mini-4k-instruct-q4.gguf \
+   --out models
+
+ # Build FAISS index from public pages
+ python -m app.ingest --pages "Large_language_model,Azure,Quantization_(signal_processing)" --lang en
+
+ # Run API
+ export MODEL_PATH="models/Phi-3-mini-4k-instruct-q4.gguf"
+ export N_GPU_LAYERS="-1"  # Metal offload on Mac
+ uvicorn app.main:app --host 0.0.0.0 --port 8000
+ ```
+
+ Test:
+ ```bash
+ curl http://localhost:8000/health
+ curl -X POST http://localhost:8000/chat \
+   -H "Content-Type: application/json" \
+   -d '{"question":"What is quantization in signal processing?"}'
+ ```
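+
+ The response follows the `ChatResponse` schema in `app/main.py` (`answer` plus a `sources` list); the values below are illustrative:
+ ```json
+ {
+   "answer": "Quantization maps a large set of input values to a smaller set ...",
+   "sources": [
+     {"title": "Quantization_(signal_processing)", "source": "https://en.wikipedia.org/wiki/Quantization_(signal_processing)"}
+   ]
+ }
+ ```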
+
+ ## Docker (local)
+
+ Build:
+ ```bash
+ docker build -t quant-llm .
+ ```
+
+ Run:
+ ```bash
+ docker run --rm -p 8000:8000 \
+   -e MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf \
+   -v "$PWD/models:/models" \
+   quant-llm
+ ```
+
+ ## Azure Container Instances (ACI)
+
+ 1) Build + push to ACR:
+ ```bash
+ az group create -n rg-quant-llm -l westeurope
+ az acr create -n acrquantllm -g rg-quant-llm --sku Basic
+ az acr login -n acrquantllm
+ az acr build -t quant-llm:1 -r acrquantllm .
+ ```
+
+ 2) Run in ACI (downloads model at startup):
+ ```bash
+ az container create \
+   -g rg-quant-llm \
+   -n quant-llm-api \
+   --image acrquantllm.azurecr.io/quant-llm:1 \
+   --registry-login-server acrquantllm.azurecr.io \
+   --registry-username <ACR_USERNAME> \
+   --registry-password <ACR_PASSWORD> \
+   --cpu 2 --memory 6 \
+   --ports 8000 \
+   --environment-variables MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf N_THREADS=2 N_GPU_LAYERS=0 \
+   --command-line "bash -lc 'python scripts/download_model.py --repo microsoft/Phi-3-mini-4k-instruct-gguf --filename Phi-3-mini-4k-instruct-q4.gguf --out /models && uvicorn app.main:app --host 0.0.0.0 --port 8000'"
+ ```
+
+ 3) Get public IP:
+ ```bash
+ az container show -g rg-quant-llm -n quant-llm-api --query ipAddress.ip -o tsv
+ ```
+
+ ## Config
+ Environment variables read by `app/settings.py`:
+ - `MODEL_PATH` (default: `models/phi-3-mini-4k-instruct-q4.gguf`)
+ - `N_CTX` (default: 4096)
+ - `N_THREADS` (default: 8)
+ - `N_GPU_LAYERS` (default: 0; use `-1` on Mac for Metal)
+ - `RAG_TOP_K` (default: 4)
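+
+ Each variable maps to a field on the pydantic `Settings` class (matching is case‑insensitive), so any of them can be overridden at launch; for example:
+ ```bash
+ export N_THREADS=4
+ export RAG_TOP_K=6
+ uvicorn app.main:app --host 0.0.0.0 --port 8000
+ ```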
+
+ ## Notes
+ - 4‑bit GGUF offers a strong cost/memory trade‑off for CPU‑only inference.
+ - RAG sources are currently Wikipedia; swap out `app/ingest.py` to index your own docs.
+
+ ## Contributing
+ See `CONTRIBUTING.md`.
+
+ ## License
+ MIT. See `LICENSE`.
app/__init__.py ADDED
File without changes
app/ingest.py ADDED
@@ -0,0 +1,94 @@
+ import argparse
+ import re
+ from typing import List, Dict
+
+ import requests
+ from sentence_transformers import SentenceTransformer
+
+ from .rag import embed_texts, build_faiss_index, save_faiss_index, save_docstore
+ from .settings import settings
+
+
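+ # Fetch plain-text page content from the MediaWiki Action API (prop=extracts).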
+ def fetch_wikipedia_page(title: str, lang: str = "en") -> str:
+     url = f"https://{lang}.wikipedia.org/w/api.php"
+     headers = {"User-Agent": "quantized-rag/0.1 (local test; contact: dev@example.com)"}
+     params = {
+         "action": "query",
+         "prop": "extracts",
+         "explaintext": 1,
+         "titles": title,
+         "format": "json",
+     }
+     resp = requests.get(url, headers=headers, params=params, timeout=30)
+     resp.raise_for_status()
+     data = resp.json()
+     pages = data.get("query", {}).get("pages", {})
+     if not pages:
+         return ""
+     page = next(iter(pages.values()))
+     return page.get("extract", "")
+
+
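+ # Word-based chunking: each window holds ~chunk_size words and starts `overlap`
+ # words before the previous window ends, so context is preserved across chunks.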
+ def chunk_text(text: str, chunk_size: int = 350, overlap: int = 40) -> List[str]:
+     words = re.findall(r"\S+", text)
+     chunks = []
+     start = 0
+     while start < len(words):
+         end = min(len(words), start + chunk_size)
+         chunk = " ".join(words[start:end])
+         chunks.append(chunk)
+         if end == len(words):
+             break
+         start = end - overlap
+         if start < 0:
+             start = 0
+     return chunks
+
+
+ def build_docs(titles: List[str], lang: str = "en") -> List[Dict]:
+     docs: List[Dict] = []
+     for title in titles:
+         text = fetch_wikipedia_page(title, lang=lang)
+         for i, chunk in enumerate(chunk_text(text)):
+             docs.append(
+                 {
+                     "id": f"{title}:{i}",
+                     "title": title,
+                     "source": f"https://{lang}.wikipedia.org/wiki/{title}",
+                     "text": chunk,
+                 }
+             )
+     return docs
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Build FAISS index from Wikipedia pages")
+     parser.add_argument(
+         "--pages",
+         required=True,
+         help="Comma-separated list of Wikipedia page titles, e.g. 'Azure,Large_language_model'",
+     )
+     parser.add_argument("--lang", default="en", help="Wikipedia language (default: en)")
+     parser.add_argument("--out-index", default=settings.faiss_index_path)
+     parser.add_argument("--out-docs", default=settings.docstore_path)
+     args = parser.parse_args()
+
+     titles = [p.strip().replace(" ", "_") for p in args.pages.split(",") if p.strip()]
+     if not titles:
+         raise SystemExit("No pages provided")
+
+     docs = build_docs(titles, lang=args.lang)
+     embedder = SentenceTransformer(settings.embed_model)
+     embeddings = embed_texts(embedder, [d["text"] for d in docs])
+
+     index = build_faiss_index(embeddings)
+     save_faiss_index(args.out_index, index)
+     save_docstore(args.out_docs, docs)
+
+     print(f"Saved {len(docs)} chunks")
+     print(f"Index: {args.out_index}")
+     print(f"Docstore: {args.out_docs}")
+
+
+ if __name__ == "__main__":
+     main()
app/main.py ADDED
@@ -0,0 +1,99 @@
+ from pathlib import Path
+ from typing import List, Optional, Dict
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from llama_cpp import Llama
+ from sentence_transformers import SentenceTransformer
+
+ from .settings import settings
+ from .rag import load_docstore, load_faiss_index, retrieve
+
+
+ class ChatRequest(BaseModel):
+     question: str
+     history: Optional[List[Dict[str, str]]] = None
+
+
+ class ChatResponse(BaseModel):
+     answer: str
+     sources: List[Dict[str, str]]
+
+
+ app = FastAPI(title="Quantized LLM + RAG")
+
+ llm: Optional[Llama] = None
+ embedder: Optional[SentenceTransformer] = None
+ rag_index = None
+ rag_docs = None
+
+
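+ # Heavy resources (LLM, embedder, FAISS index) are loaded once at startup and
+ # shared across requests.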
+ @app.on_event("startup")
+ def load_resources() -> None:
+     global llm, embedder, rag_index, rag_docs
+
+     model_path = Path(settings.model_path)
+     if not model_path.exists():
+         raise RuntimeError(
+             f"Model not found at {model_path}. Set MODEL_PATH env var or download a GGUF model."
+         )
+
+     llm = Llama(
+         model_path=str(model_path),
+         n_ctx=settings.n_ctx,
+         n_threads=settings.n_threads,
+         n_gpu_layers=settings.n_gpu_layers,
+     )
+
+     embedder = SentenceTransformer(settings.embed_model)
+
+     index_path = Path(settings.faiss_index_path)
+     docs_path = Path(settings.docstore_path)
+     if index_path.exists() and docs_path.exists():
+         rag_index = load_faiss_index(str(index_path))
+         rag_docs = load_docstore(str(docs_path))
+     else:
+         rag_index = None
+         rag_docs = None
+
+
+ @app.get("/health")
+ def health() -> Dict[str, str]:
+     return {"status": "ok"}
+
+
+ @app.post("/chat", response_model=ChatResponse)
+ def chat(req: ChatRequest) -> ChatResponse:
+     if llm is None or embedder is None:
+         raise HTTPException(status_code=500, detail="Model not loaded")
+     if not req.question.strip():
+         raise HTTPException(status_code=400, detail="Question is required")
+
+     context_blocks = []
+     sources: List[Dict[str, str]] = []
+     if rag_index is not None and rag_docs is not None:
+         results = retrieve(req.question, embedder, rag_index, rag_docs, settings.rag_top_k)
+         for doc, score in results:
+             context_blocks.append(f"[Source] {doc['text']}")
+             sources.append({"title": doc.get("title", ""), "source": doc.get("source", "")})
+
+     system_prompt = (
+         "You are a helpful assistant. Use the provided context to answer. "
+         "If the answer is not in the context, say you do not know."
+     )
+     context = "\n\n".join(context_blocks) if context_blocks else ""
+
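+     # Phi-3-style chat tags; <|context|> is this app's own marker for retrieved
+     # passages, placed between the user turn and the assistant turn.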
+     prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{req.question}\n"
+     if context:
+         prompt += f"<|context|>\n{context}\n"
+     prompt += "<|assistant|>\n"
+
+     output = llm(
+         prompt,
+         temperature=settings.temperature,
+         max_tokens=settings.max_tokens,
+         stop=["<|user|>", "<|assistant|>", "<|system|>", "</s>"],
+     )
+
+     answer = output["choices"][0]["text"].strip()
+     return ChatResponse(answer=answer, sources=sources)
app/rag.py ADDED
@@ -0,0 +1,58 @@
+ import json
+ from typing import List, Dict, Tuple
+
+ import faiss
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+
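+ # Embeddings are L2-normalized so FAISS inner-product search (IndexFlatIP)
+ # is equivalent to cosine similarity.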
+ def _normalize(vecs: np.ndarray) -> np.ndarray:
+     norms = np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12
+     return vecs / norms
+
+
+ def embed_texts(model: SentenceTransformer, texts: List[str]) -> np.ndarray:
+     embeddings = model.encode(texts, batch_size=32, show_progress_bar=False)
+     embeddings = np.array(embeddings, dtype=np.float32)
+     return _normalize(embeddings)
+
+
+ def save_docstore(path: str, docs: List[Dict]) -> None:
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(docs, f, ensure_ascii=False, indent=2)
+
+
+ def load_docstore(path: str) -> List[Dict]:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
+     index = faiss.IndexFlatIP(embeddings.shape[1])
+     index.add(embeddings)
+     return index
+
+
+ def save_faiss_index(path: str, index: faiss.IndexFlatIP) -> None:
+     faiss.write_index(index, path)
+
+
+ def load_faiss_index(path: str) -> faiss.IndexFlatIP:
+     return faiss.read_index(path)
+
+
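+ # Return the top_k nearest chunks; FAISS pads with index -1 when the index
+ # holds fewer than top_k vectors, so those slots are skipped.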
+ def retrieve(
+     query: str,
+     model: SentenceTransformer,
+     index: faiss.IndexFlatIP,
+     docs: List[Dict],
+     top_k: int = 4,
+ ) -> List[Tuple[Dict, float]]:
+     query_vec = embed_texts(model, [query])
+     scores, indices = index.search(query_vec, top_k)
+     results: List[Tuple[Dict, float]] = []
+     for idx, score in zip(indices[0], scores[0]):
+         if idx == -1:
+             continue
+         results.append((docs[int(idx)], float(score)))
+     return results
app/settings.py ADDED
@@ -0,0 +1,24 @@
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     # Model
+     model_path: str = "models/phi-3-mini-4k-instruct-q4.gguf"
+     n_ctx: int = 4096
+     n_threads: int = 8
+     n_gpu_layers: int = 0  # set -1 on Mac Metal to offload all layers
+     temperature: float = 0.2
+     max_tokens: int = 512
+
+     # RAG
+     embed_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+     faiss_index_path: str = "data/index.faiss"
+     docstore_path: str = "data/docstore.json"
+     rag_top_k: int = 4
+
+     # API
+     app_host: str = "0.0.0.0"
+     app_port: int = 8000
+
+
+ settings = Settings()
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn[standard]
+ pydantic-settings
+ llama-cpp-python
+ sentence-transformers
+ faiss-cpu
+ numpy
+ requests
+ huggingface_hub
scripts/download_model.py ADDED
@@ -0,0 +1,28 @@
+ import argparse
+ from pathlib import Path
+
+ from huggingface_hub import hf_hub_download
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Download GGUF model from Hugging Face")
+     parser.add_argument("--repo", required=True, help="HF repo id, e.g. microsoft/Phi-3-mini-4k-instruct-gguf")
+     parser.add_argument("--filename", required=True, help="GGUF filename, e.g. Phi-3-mini-4k-instruct-q4.gguf")
+     parser.add_argument("--out", default="models", help="Output directory")
+     args = parser.parse_args()
+
+     out_dir = Path(args.out)
+     out_dir.mkdir(parents=True, exist_ok=True)
+
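+     # local_dir_use_symlinks=False copies the real file into out_dir instead of
+     # leaving a symlink into the Hugging Face cache (the flag is deprecated in
+     # newer huggingface_hub versions, where copying is the default behavior).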
+     file_path = hf_hub_download(
+         repo_id=args.repo,
+         filename=args.filename,
+         local_dir=str(out_dir),
+         local_dir_use_symlinks=False,
+     )
+
+     print(f"Downloaded to {file_path}")
+
+
+ if __name__ == "__main__":
+     main()
scripts/start.sh ADDED
@@ -0,0 +1,13 @@
+ #!/bin/sh
+ set -e
+
+ MODEL_PATH=${MODEL_PATH:-/models/Phi-3-mini-4k-instruct-q4.gguf}
+
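+ # Fetch the model on first start if it is not already present (e.g., on a mounted volume).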
+ if [ ! -f "$MODEL_PATH" ]; then
+     python scripts/download_model.py \
+         --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+         --filename Phi-3-mini-4k-instruct-q4.gguf \
+         --out /models
+ fi
+
+ exec uvicorn app.main:app --host 0.0.0.0 --port 8000