sitsope committed on
Commit
a32bf9d
·
verified ·
1 Parent(s): d6c9f5a

Upload folder using huggingface_hub

Files changed (14)
  1. .gitattributes +1 -35
  2. .gitignore +17 -0
  3. CONTRIBUTING.md +17 -0
  4. Dockerfile +31 -0
  5. LICENSE +21 -0
  6. README.md +118 -3
  7. app/__init__.py +0 -0
  8. app/ingest.py +94 -0
  9. app/main.py +99 -0
  10. app/rag.py +58 -0
  11. app/settings.py +24 -0
  12. requirements.txt +9 -0
  13. scripts/download_model.py +28 -0
  14. scripts/start.sh +13 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,17 @@
+ # Python
+ .venv/
+ __pycache__/
+ *.pyc
+
+ # Local models / data
+ models/
+ data/index.faiss
+ data/docstore.json
+
+ # Azure local config
+ .azure-config/
+
+ # OS / editor
+ .DS_Store
+ .vscode/
+ .idea/
CONTRIBUTING.md ADDED
@@ -0,0 +1,17 @@
+ # Contributing
+
+ Thanks for considering a contribution.
+
+ ## Quick guidelines
+ - Keep changes focused and minimal.
+ - Run tests or a basic smoke test when possible.
+ - Update docs if behavior changes.
+
+ ## Pull requests
+ 1. Fork and create a feature branch.
+ 2. Make changes with clear commit messages.
+ 3. Open a PR describing what/why and how to test.
+
+ ## Issues
+ - Provide steps to reproduce.
+ - Include logs or error traces when relevant.
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends build-essential \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Build llama-cpp without -march=native to avoid illegal instruction on weaker CPUs
+ ENV CMAKE_ARGS="-DLLAMA_NATIVE=OFF" \
+     FORCE_CMAKE=1
+
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY app ./app
+ COPY scripts ./scripts
+ COPY data ./data
+
+ RUN chmod +x /app/scripts/start.sh
+
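+ # Runtime defaults; override with `docker run -e` or ACI --environment-variables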
+ ENV MODEL_PATH="/models/Phi-3-mini-4k-instruct-q4.gguf" \
+     N_THREADS="4" \
+     N_GPU_LAYERS="0" \
+     N_CTX="4096" \
+     RAG_TOP_K="4" \
+     APP_PORT="8000"
+
+ EXPOSE 8000
+
+ CMD ["/bin/sh", "/app/scripts/start.sh"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Sekponakokou
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,118 @@
- ---
- license: mit
- ---
+ # Quantized LLM + RAG (FastAPI + FAISS + Phi‑3)
+
+ ## Goal
+ Deploy a small, low‑cost LLM with 4‑bit quantization + RAG: a clean FastAPI service serving a 4‑bit GGUF model through a lightweight FAISS pipeline, built to run on CPU‑only servers (e.g., Azure Container Instances) and to be tested locally on a Mac.
+
+ ## Features
+ - 4‑bit quantized Phi‑3 GGUF (llama.cpp via `llama-cpp-python`)
+ - Simple RAG with FAISS (cosine similarity)
+ - Wikipedia public-source ingestion (replaceable)
+ - Docker image ready for ACI
+
+ ## Repo structure
+ ```
+ app/
+   main.py      # FastAPI app
+   rag.py       # FAISS utilities
+   ingest.py    # build index from public sources
+   settings.py  # config via env
+ scripts/
+   download_model.py
+ Dockerfile
+ requirements.txt
+ ```
+
+ ## Local dev (Mac)
+
+ ```bash
+ python3.12 -m venv .venv
+ . .venv/bin/activate
+ pip install -r requirements.txt
+
+ # Download 4-bit Phi-3 GGUF
+ python scripts/download_model.py \
+   --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+   --filename Phi-3-mini-4k-instruct-q4.gguf \
+   --out models
+
+ # Build FAISS index from public pages
+ python -m app.ingest --pages "Large_language_model,Azure,Quantization_(signal_processing)" --lang en
+
+ # Run API
+ export MODEL_PATH="models/Phi-3-mini-4k-instruct-q4.gguf"
+ export N_GPU_LAYERS="-1"  # Metal offload on Mac
+ uvicorn app.main:app --host 0.0.0.0 --port 8000
+ ```
+
+ Test:
+ ```bash
+ curl http://localhost:8000/health
+ curl -X POST http://localhost:8000/chat \
+   -H "Content-Type: application/json" \
+   -d '{"question":"What is quantization in signal processing?"}'
+ ```
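+
+ The response follows the `ChatResponse` schema in `app/main.py` (`answer` plus a `sources` list); the values below are illustrative:
+ ```json
+ {
+   "answer": "Quantization maps a large set of input values to a smaller set ...",
+   "sources": [
+     {"title": "Quantization_(signal_processing)", "source": "https://en.wikipedia.org/wiki/Quantization_(signal_processing)"}
+   ]
+ }
+ ```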
+
+ ## Docker (local)
+
+ Build:
+ ```bash
+ docker build -t quant-llm .
+ ```
+
+ Run:
+ ```bash
+ docker run --rm -p 8000:8000 \
+   -e MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf \
+   -v "$PWD/models:/models" \
+   quant-llm
+ ```
+
+ ## Azure Container Instances (ACI)
+
+ 1) Build + push to ACR:
+ ```bash
+ az group create -n rg-quant-llm -l westeurope
+ az acr create -n acrquantllm -g rg-quant-llm --sku Basic
+ az acr login -n acrquantllm
+ az acr build -t quant-llm:1 -r acrquantllm .
+ ```
+
+ 2) Run in ACI (downloads model at startup):
+ ```bash
+ az container create \
+   -g rg-quant-llm \
+   -n quant-llm-api \
+   --image acrquantllm.azurecr.io/quant-llm:1 \
+   --registry-login-server acrquantllm.azurecr.io \
+   --registry-username <ACR_USERNAME> \
+   --registry-password <ACR_PASSWORD> \
+   --cpu 2 --memory 6 \
+   --ports 8000 \
+   --environment-variables MODEL_PATH=/models/Phi-3-mini-4k-instruct-q4.gguf N_THREADS=2 N_GPU_LAYERS=0 \
+   --command-line "bash -lc 'python scripts/download_model.py --repo microsoft/Phi-3-mini-4k-instruct-gguf --filename Phi-3-mini-4k-instruct-q4.gguf --out /models && uvicorn app.main:app --host 0.0.0.0 --port 8000'"
+ ```
+
+ 3) Get public IP:
+ ```bash
+ az container show -g rg-quant-llm -n quant-llm-api --query ipAddress.ip -o tsv
+ ```
+
+ ## Config
+ Environment variables read by `app/settings.py`:
+ - `MODEL_PATH` (default: `models/phi-3-mini-4k-instruct-q4.gguf`)
+ - `N_CTX` (default: 4096)
+ - `N_THREADS` (default: 8)
+ - `N_GPU_LAYERS` (default: 0; use `-1` on Mac for Metal)
+ - `RAG_TOP_K` (default: 4)
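+
+ Each variable maps to a field on the pydantic `Settings` class (matching is case‑insensitive), so any of them can be overridden at launch; for example:
+ ```bash
+ export N_THREADS=4
+ export RAG_TOP_K=6
+ uvicorn app.main:app --host 0.0.0.0 --port 8000
+ ```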
+
+ ## Notes
+ - 4‑bit GGUF offers a strong cost/memory trade‑off for CPU‑only inference.
+ - RAG sources are currently Wikipedia; swap out `app/ingest.py` to index your own docs.
+
+ ## Contributing
+ See `CONTRIBUTING.md`.
+
+ ## License
+ MIT. See `LICENSE`.
app/__init__.py ADDED
File without changes
app/ingest.py ADDED
@@ -0,0 +1,94 @@
+ import argparse
+ import re
+ from typing import List, Dict
+
+ import requests
+ from sentence_transformers import SentenceTransformer
+
+ from .rag import embed_texts, build_faiss_index, save_faiss_index, save_docstore
+ from .settings import settings
+
+
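+ # Fetch plain-text page content from the MediaWiki Action API (prop=extracts).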
+ def fetch_wikipedia_page(title: str, lang: str = "en") -> str:
+     url = f"https://{lang}.wikipedia.org/w/api.php"
+     headers = {"User-Agent": "quantized-rag/0.1 (local test; contact: dev@example.com)"}
+     params = {
+         "action": "query",
+         "prop": "extracts",
+         "explaintext": 1,
+         "titles": title,
+         "format": "json",
+     }
+     resp = requests.get(url, headers=headers, params=params, timeout=30)
+     resp.raise_for_status()
+     data = resp.json()
+     pages = data.get("query", {}).get("pages", {})
+     if not pages:
+         return ""
+     page = next(iter(pages.values()))
+     return page.get("extract", "")
+
+
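+ # Word-based chunking: each window holds ~chunk_size words and starts `overlap`
+ # words before the previous window ends, so context is preserved across chunks.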
+ def chunk_text(text: str, chunk_size: int = 350, overlap: int = 40) -> List[str]:
+     words = re.findall(r"\S+", text)
+     chunks = []
+     start = 0
+     while start < len(words):
+         end = min(len(words), start + chunk_size)
+         chunk = " ".join(words[start:end])
+         chunks.append(chunk)
+         if end == len(words):
+             break
+         start = end - overlap
+         if start < 0:
+             start = 0
+     return chunks
+
+
+ def build_docs(titles: List[str], lang: str = "en") -> List[Dict]:
+     docs: List[Dict] = []
+     for title in titles:
+         text = fetch_wikipedia_page(title, lang=lang)
+         for i, chunk in enumerate(chunk_text(text)):
+             docs.append(
+                 {
+                     "id": f"{title}:{i}",
+                     "title": title,
+                     "source": f"https://{lang}.wikipedia.org/wiki/{title}",
+                     "text": chunk,
+                 }
+             )
+     return docs
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Build FAISS index from Wikipedia pages")
+     parser.add_argument(
+         "--pages",
+         required=True,
+         help="Comma-separated list of Wikipedia page titles, e.g. 'Azure,Large_language_model'",
+     )
+     parser.add_argument("--lang", default="en", help="Wikipedia language (default: en)")
+     parser.add_argument("--out-index", default=settings.faiss_index_path)
+     parser.add_argument("--out-docs", default=settings.docstore_path)
+     args = parser.parse_args()
+
+     titles = [p.strip().replace(" ", "_") for p in args.pages.split(",") if p.strip()]
+     if not titles:
+         raise SystemExit("No pages provided")
+
+     docs = build_docs(titles, lang=args.lang)
+     embedder = SentenceTransformer(settings.embed_model)
+     embeddings = embed_texts(embedder, [d["text"] for d in docs])
+
+     index = build_faiss_index(embeddings)
+     save_faiss_index(args.out_index, index)
+     save_docstore(args.out_docs, docs)
+
+     print(f"Saved {len(docs)} chunks")
+     print(f"Index: {args.out_index}")
+     print(f"Docstore: {args.out_docs}")
+
+
+ if __name__ == "__main__":
+     main()
app/main.py ADDED
@@ -0,0 +1,99 @@
+ from pathlib import Path
+ from typing import List, Optional, Dict
+
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from llama_cpp import Llama
+ from sentence_transformers import SentenceTransformer
+
+ from .settings import settings
+ from .rag import load_docstore, load_faiss_index, retrieve
+
+
+ class ChatRequest(BaseModel):
+     question: str
+     history: Optional[List[Dict[str, str]]] = None
+
+
+ class ChatResponse(BaseModel):
+     answer: str
+     sources: List[Dict[str, str]]
+
+
+ app = FastAPI(title="Quantized LLM + RAG")
+
+ llm: Optional[Llama] = None
+ embedder: Optional[SentenceTransformer] = None
+ rag_index = None
+ rag_docs = None
+
+
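+ # Heavy resources (LLM, embedder, FAISS index) are loaded once at startup and
+ # shared across requests.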
+ @app.on_event("startup")
+ def load_resources() -> None:
+     global llm, embedder, rag_index, rag_docs
+
+     model_path = Path(settings.model_path)
+     if not model_path.exists():
+         raise RuntimeError(
+             f"Model not found at {model_path}. Set MODEL_PATH env var or download a GGUF model."
+         )
+
+     llm = Llama(
+         model_path=str(model_path),
+         n_ctx=settings.n_ctx,
+         n_threads=settings.n_threads,
+         n_gpu_layers=settings.n_gpu_layers,
+     )
+
+     embedder = SentenceTransformer(settings.embed_model)
+
+     index_path = Path(settings.faiss_index_path)
+     docs_path = Path(settings.docstore_path)
+     if index_path.exists() and docs_path.exists():
+         rag_index = load_faiss_index(str(index_path))
+         rag_docs = load_docstore(str(docs_path))
+     else:
+         rag_index = None
+         rag_docs = None
+
+
+ @app.get("/health")
+ def health() -> Dict[str, str]:
+     return {"status": "ok"}
+
+
+ @app.post("/chat", response_model=ChatResponse)
+ def chat(req: ChatRequest) -> ChatResponse:
+     if llm is None or embedder is None:
+         raise HTTPException(status_code=500, detail="Model not loaded")
+     if not req.question.strip():
+         raise HTTPException(status_code=400, detail="Question is required")
+
+     context_blocks = []
+     sources: List[Dict[str, str]] = []
+     if rag_index is not None and rag_docs is not None:
+         results = retrieve(req.question, embedder, rag_index, rag_docs, settings.rag_top_k)
+         for doc, score in results:
+             context_blocks.append(f"[Source] {doc['text']}")
+             sources.append({"title": doc.get("title", ""), "source": doc.get("source", "")})
+
+     system_prompt = (
+         "You are a helpful assistant. Use the provided context to answer. "
+         "If the answer is not in the context, say you do not know."
+     )
+     context = "\n\n".join(context_blocks) if context_blocks else ""
+
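+     # Phi-3-style chat tags; <|context|> is this app's own marker for retrieved
+     # passages, placed between the user turn and the assistant turn.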
+     prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{req.question}\n"
+     if context:
+         prompt += f"<|context|>\n{context}\n"
+     prompt += "<|assistant|>\n"
+
+     output = llm(
+         prompt,
+         temperature=settings.temperature,
+         max_tokens=settings.max_tokens,
+         stop=["<|user|>", "<|assistant|>", "<|system|>", "</s>"],
+     )
+
+     answer = output["choices"][0]["text"].strip()
+     return ChatResponse(answer=answer, sources=sources)
app/rag.py ADDED
@@ -0,0 +1,58 @@
+ import json
+ from typing import List, Dict, Tuple
+
+ import faiss
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+
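+ # Embeddings are L2-normalized so FAISS inner-product search (IndexFlatIP)
+ # is equivalent to cosine similarity.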
+ def _normalize(vecs: np.ndarray) -> np.ndarray:
+     norms = np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12
+     return vecs / norms
+
+
+ def embed_texts(model: SentenceTransformer, texts: List[str]) -> np.ndarray:
+     embeddings = model.encode(texts, batch_size=32, show_progress_bar=False)
+     embeddings = np.array(embeddings, dtype=np.float32)
+     return _normalize(embeddings)
+
+
+ def save_docstore(path: str, docs: List[Dict]) -> None:
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(docs, f, ensure_ascii=False, indent=2)
+
+
+ def load_docstore(path: str) -> List[Dict]:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
+     index = faiss.IndexFlatIP(embeddings.shape[1])
+     index.add(embeddings)
+     return index
+
+
+ def save_faiss_index(path: str, index: faiss.IndexFlatIP) -> None:
+     faiss.write_index(index, path)
+
+
+ def load_faiss_index(path: str) -> faiss.IndexFlatIP:
+     return faiss.read_index(path)
+
+
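+ # Return the top_k nearest chunks; FAISS pads with index -1 when the index
+ # holds fewer than top_k vectors, so those slots are skipped.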
+ def retrieve(
+     query: str,
+     model: SentenceTransformer,
+     index: faiss.IndexFlatIP,
+     docs: List[Dict],
+     top_k: int = 4,
+ ) -> List[Tuple[Dict, float]]:
+     query_vec = embed_texts(model, [query])
+     scores, indices = index.search(query_vec, top_k)
+     results: List[Tuple[Dict, float]] = []
+     for idx, score in zip(indices[0], scores[0]):
+         if idx == -1:
+             continue
+         results.append((docs[int(idx)], float(score)))
+     return results
app/settings.py ADDED
@@ -0,0 +1,24 @@
+ from pydantic_settings import BaseSettings
+
+
+ class Settings(BaseSettings):
+     # Model
+     model_path: str = "models/phi-3-mini-4k-instruct-q4.gguf"
+     n_ctx: int = 4096
+     n_threads: int = 8
+     n_gpu_layers: int = 0  # set -1 on Mac Metal to offload all layers
+     temperature: float = 0.2
+     max_tokens: int = 512
+
+     # RAG
+     embed_model: str = "sentence-transformers/all-MiniLM-L6-v2"
+     faiss_index_path: str = "data/index.faiss"
+     docstore_path: str = "data/docstore.json"
+     rag_top_k: int = 4
+
+     # API
+     app_host: str = "0.0.0.0"
+     app_port: int = 8000
+
+
+ settings = Settings()
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ fastapi
+ uvicorn[standard]
+ pydantic-settings
+ llama-cpp-python
+ sentence-transformers
+ faiss-cpu
+ numpy
+ requests
+ huggingface_hub
scripts/download_model.py ADDED
@@ -0,0 +1,28 @@
+ import argparse
+ from pathlib import Path
+
+ from huggingface_hub import hf_hub_download
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser(description="Download GGUF model from Hugging Face")
+     parser.add_argument("--repo", required=True, help="HF repo id, e.g. microsoft/Phi-3-mini-4k-instruct-gguf")
+     parser.add_argument("--filename", required=True, help="GGUF filename, e.g. Phi-3-mini-4k-instruct-q4.gguf")
+     parser.add_argument("--out", default="models", help="Output directory")
+     args = parser.parse_args()
+
+     out_dir = Path(args.out)
+     out_dir.mkdir(parents=True, exist_ok=True)
+
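+     # local_dir_use_symlinks=False copies the real file into out_dir instead of
+     # leaving a symlink into the Hugging Face cache (the flag is deprecated in
+     # newer huggingface_hub versions, where copying is the default behavior).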
+     file_path = hf_hub_download(
+         repo_id=args.repo,
+         filename=args.filename,
+         local_dir=str(out_dir),
+         local_dir_use_symlinks=False,
+     )
+
+     print(f"Downloaded to {file_path}")
+
+
+ if __name__ == "__main__":
+     main()
scripts/start.sh ADDED
@@ -0,0 +1,13 @@
+ #!/bin/sh
+ set -e
+
+ MODEL_PATH=${MODEL_PATH:-/models/Phi-3-mini-4k-instruct-q4.gguf}
+
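+ # Fetch the model on first start if it is not already present (e.g., on a mounted volume).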
+ if [ ! -f "$MODEL_PATH" ]; then
+     python scripts/download_model.py \
+         --repo microsoft/Phi-3-mini-4k-instruct-gguf \
+         --filename Phi-3-mini-4k-instruct-q4.gguf \
+         --out /models
+ fi
+
+ exec uvicorn app.main:app --host 0.0.0.0 --port 8000