Spaces:

robertolofaro
/

articles

Running

App Files Community

robertolofaro committed 10 days ago

Commit

7af5e1b

verified ·

1 Parent(s): 285322b

Upload app.py

Browse files

Files changed (1) hide show

app.py +78 -314

app.py CHANGED Viewed

@@ -1,134 +1,62 @@
-"""
-app.py  –  Article Q&A chatbot
-Runs on:
-  • Hugging Face Spaces  (CPU-only, default)
-  • Local PC             (CPU or CUDA GPU)
-Environment variables
----------------------
-HF_TOKEN            HuggingFace token for private model repo  (required on HF Space)
-LOCAL_MODE          Set to "1" to force local-PC behaviour     (optional; auto-detected via SPACE_ID)
-LOCAL_MODEL_PATH    Absolute path to the .gguf file on disk    (optional; skips HF hub download)
-GITHUB_TOKEN        GitHub PAT for higher rate-limits          (optional; works without it)
-N_THREADS           Override CPU thread count                  (optional)
-"""
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 import pickle
-import requests
-from datetime import datetime, timedelta
 from langchain_huggingface import HuggingFaceEmbeddings
-# ====================== ENVIRONMENT DETECTION ======================
-# HuggingFace Spaces always set SPACE_ID; absent → we're running locally.
-IS_HF_SPACE = bool(os.environ.get("SPACE_ID"))
-IS_LOCAL    = (not IS_HF_SPACE) or (os.environ.get("LOCAL_MODE", "0") == "1")
-def _detect_cuda() -> bool:
-    """Return True only when a CUDA device is actually usable by llama-cpp."""
-    if not IS_LOCAL:
-        return False          # HF free tier is CPU-only
-    try:
-        import torch
-        return torch.cuda.is_available()
-    except ImportError:
-        pass
-    # Fallback: check for libcuda without torch
-    try:
-        import ctypes
-        ctypes.cdll.LoadLibrary("libcuda.so.1")
-        return True
-    except Exception:
-        return False
-CUDA_AVAILABLE = _detect_cuda()
-# -1  → offload every layer to GPU;  0 → pure CPU
-N_GPU_LAYERS   = -1 if CUDA_AVAILABLE else 0
-# Use all available cores locally; HF free tier: keep at 2 to avoid OOM
-N_THREADS      = int(os.environ.get("N_THREADS", os.cpu_count() if IS_LOCAL else 2))
 # ====================== CONFIG ======================
-REPO_ID            = "robertolofaro/articles-model"
-MODEL_FILENAME     = "articles-Q4_K_M.gguf"
 BACKENDS = {
     "FAISS - RAG (HNSW)": "FAISS",
-    "Qdrant - RAG":        "Qdrant",
 }
-FAISS_PATH         = "faiss_index_hnsw"
-QDRANT_PATH        = "qdrant_db"
-QDRANT_COLLECTION  = "articles"
-# MorningNews GitHub location
-GH_OWNER           = "robertolofaro"
-GH_REPO            = "supportmaterial"
-GH_NEWS_PATH       = "MorningNewsAgentTest"
-GH_API_ROOT        = "https://api.github.com"
-GH_RAW_ROOT        = "https://raw.githubusercontent.com"
-NEWS_ACCEPTED_EXT  = (".txt", ".md", ".json")
-NEWS_MAX_CHARS_FILE = 2000   # chars kept per file
-NEWS_MAX_CHARS_TOTAL = 3500  # total chars injected into prompt
-NEWS_CACHE_TTL     = timedelta(hours=1)
-# Web search
-WEB_MAX_RESULTS    = 5
-WEB_MAX_CHARS      = 2500    # total chars from web injected into prompt
-# ====================== LOAD METADATA ======================
-def load_articles_list() -> list[str]:
     try:
         with open("metadata.pkl", "rb") as f:
             df = pickle.load(f)
-        cats = sorted(df["article_category"].unique().tolist())
-        return ["All categories"] + cats
-    except Exception:
         return ["All categories"]
 ARTICLE_LIST = load_articles_list()
 # ====================== LOAD LLM ======================
-def _load_llm() -> Llama:
-    local_path = os.environ.get("LOCAL_MODEL_PATH", "")
-    if IS_LOCAL and local_path and os.path.isfile(local_path):
-        model_path = local_path
-        print(f"[LLM] Loading from local path: {model_path}")
-    else:
-        model_path = hf_hub_download(
-            repo_id=REPO_ID,
-            filename=MODEL_FILENAME,
-            repo_type="model",
-            token=os.environ.get("HF_TOKEN"),
-        )
-        print(f"[LLM] Downloaded from HF hub → {model_path}")
-    print(f"[LLM] n_gpu_layers={N_GPU_LAYERS}  n_threads={N_THREADS}  cuda={CUDA_AVAILABLE}")
-    return Llama(
-        model_path=model_path,
-        n_ctx=4096,
-        n_threads=N_THREADS,
-        n_batch=512,
-        n_ubatch=512,
-        n_gpu_layers=N_GPU_LAYERS,
-        verbose=False,
-    )
-llm = _load_llm()
-# ====================== RAG VECTORSTORE CACHE ======================
-_vectorstores: dict = {}
 def get_vectorstore(backend_name: str):
-    if backend_name in _vectorstores:
-        return _vectorstores[backend_name]
     try:
-        embeddings = HuggingFaceEmbeddings(
-            model_name="BAAI/bge-small-en-v1.5",
-            encode_kwargs={"normalize_embeddings": True},
-        )
         if backend_name == "FAISS":
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
@@ -138,167 +66,45 @@ def get_vectorstore(backend_name: str):
         else:
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
-        _vectorstores[backend_name] = vs
         return vs
-    except Exception as e:
-        print(f"[RAG] Could not load vectorstore '{backend_name}': {e}")
         return None
-# ====================== MORNING NEWS FETCHER ======================
-_news_cache: dict = {"content": None, "fetched_at": None}
-def fetch_morning_news() -> str:
-    """
-    Fetch text/md/json files from the MorningNewsAgentTest directory on GitHub.
-    Results are cached for NEWS_CACHE_TTL to avoid hammering the API.
-    Works with or without a GITHUB_TOKEN (unauthenticated rate-limit: 60 req/hr).
-    """
-    global _news_cache
-    now = datetime.utcnow()
-    # Serve from cache if still fresh
-    if _news_cache["content"] is not None and _news_cache["fetched_at"]:
-        if now - _news_cache["fetched_at"] < NEWS_CACHE_TTL:
-            print("[MorningNews] Serving from cache")
-            return _news_cache["content"]
-    headers = {"Accept": "application/vnd.github.v3+json"}
-    gh_token = os.environ.get("GITHUB_TOKEN", "")
-    if gh_token:
-        headers["Authorization"] = f"token {gh_token}"
-    try:
-        # List files in the directory
-        dir_url = f"{GH_API_ROOT}/repos/{GH_OWNER}/{GH_REPO}/contents/{GH_NEWS_PATH}"
-        resp = requests.get(dir_url, headers=headers, timeout=10)
-        resp.raise_for_status()
-        entries = resp.json()
-        # Sort by name descending so the most recent file (date-prefixed) comes first
-        entries = sorted(
-            [e for e in entries if e["type"] == "file"
-             and e["name"].lower().endswith(NEWS_ACCEPTED_EXT)],
-            key=lambda e: e["name"],
-            reverse=True,
-        )
-        collected, total_chars = [], 0
-        for entry in entries:
-            if total_chars >= NEWS_MAX_CHARS_TOTAL:
-                break
-            raw_url = entry["download_url"]
-            try:
-                file_resp = requests.get(raw_url, headers=headers, timeout=10)
-                file_resp.raise_for_status()
-                snippet = file_resp.text[:NEWS_MAX_CHARS_FILE]
-                collected.append(f"--- [{entry['name']}] ---\n{snippet}")
-                total_chars += len(snippet)
-            except Exception as fe:
-                print(f"[MorningNews] Could not fetch {entry['name']}: {fe}")
-        combined = "\n\n".join(collected)[:NEWS_MAX_CHARS_TOTAL]
-        _news_cache = {"content": combined, "fetched_at": now}
-        print(f"[MorningNews] Fetched {len(collected)} file(s), {len(combined)} chars")
-        return combined
-    except Exception as e:
-        print(f"[MorningNews] Directory listing failed: {e}")
-        # Return stale cache rather than nothing if available
-        return _news_cache.get("content") or ""
-# ====================== SYSTEM PROMPTS ======================
-# Base prompt – articles only
-SYSTEM_PROMPT_BASE = """You are the reference expert for the articles contained in the training of this model, \
-all extracted from the website robertolofaro.com, and all focused on change.
-# Your Mission
-When a user asks a question, provide a structured response based ONLY on the articles in your training. \
-Do not provide general advice from outside these sources.
-# Response Format
-1. Executive Summary: A 2-3 sentence overview answering the core query.
-2. Guidelines & Hints: A markdown list of specific answers/guidelines/hints found in the source material.
-"""
-# Extended prompt – when extra sources are active
-SYSTEM_PROMPT_EXTENDED = """You are the reference expert for the articles contained in the training of this model, \
-all extracted from the website robertolofaro.com, and all focused on change. \
-You have also been provided with supplementary external context (morning news results).
-# Your Mission
-Provide a structured response that integrates all available information. \
-Clearly tag each insight with its source label so the reader can judge its provenance:
-  [Articles]     – insight from the trained article corpus
-  [MorningNews]  – insight from the morning news briefing
-# Response Format
 1. Executive Summary: A 2-3 sentence overview answering the core query.
-2. Guidelines & Hints: A markdown list of tagged insights from the source material.
-3. Additional Context (when MorningNews are present): \
-   brief synthesis of external findings relevant to the query.
 """
-# ====================== CONTEXT BUDGET HELPER ======================
-# Rough token estimate: 1 token ≈ 4 chars for English text.
-# n_ctx=4096 → reserve ~800 for answer, ~400 for system+history → ~2900 chars for context.
-CONTEXT_BUDGET_CHARS = 2900
-def _trim_to_budget(parts: list[tuple[str, str]]) -> str:
-    """
-    parts = [(label, text), ...]
-    Allocates the context budget proportionally across available sources,
-    then returns a single assembled context string.
-    """
-    # First pass: measure totals
-    totals = [(label, text) for label, text in parts if text.strip()]
-    if not totals:
-        return ""
-    per_source = CONTEXT_BUDGET_CHARS // len(totals)
-    sections = []
-    for label, text in totals:
-        trimmed = text[:per_source]
-        sections.append(f"=== {label} ===\n{trimmed}")
-    return "\n\n".join(sections)
 # ====================== GENERATION FUNCTION ======================
-def generate_response(
-    message, history,
-    rag_mode, article_filter,
-    use_morning_news,
-    max_tokens, temperature, top_p, repeat_penalty,
-):
-    has_extra = use_morning_news
-    system_prompt = SYSTEM_PROMPT_EXTENDED if has_extra else SYSTEM_PROMPT_BASE
-    full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
-    # Keep the last 4 turns to limit context pressure
     for msg in history[-4:]:
         full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
-    # ---- Gather context from all active sources ----
-    context_parts: list[tuple[str, str]] = []
-    # 1. RAG (vectorstore)
     backend = BACKENDS.get(rag_mode)
     if backend:
         vs = get_vectorstore(backend)
         if vs:
             try:
-                filt = {"article_category": article_filter} if article_filter != "All categories" else None
-                docs = vs.similarity_search(message, k=5, filter=filt)
-                rag_text = "\n\n".join(
-                    f"[Cat: {d.metadata.get('article_category','N/A')}] {d.page_content[:700]}"
-                    for d in docs
-                )
-                context_parts.append(("ARTICLES CONTEXT", rag_text))
-            except Exception as e:
-                print(f"[RAG] similarity_search failed: {e}")
-    # 2. Morning News
-    if use_morning_news:
-        news = fetch_morning_news()
-        if news:
-            context_parts.append(("MORNING NEWS BRIEFING", news))
-    # ---- Assemble context within token budget ----
-    context = _trim_to_budget(context_parts)
     if context:
         full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
@@ -307,99 +113,57 @@ def generate_response(
     full_prompt += "<|im_start|>assistant\n"
-    # ---- Inference parameters ----
-    max_tok  = int(max_tokens)    if max_tokens    is not None else 900
-    temp     = float(temperature) if temperature   is not None else 0.65
-    tp       = float(top_p)       if top_p         is not None else 0.9
-    rep_pen  = float(repeat_penalty) if repeat_penalty is not None else 1.1
-    partial = ""
     for chunk in llm(
         full_prompt,
-        max_tokens=max_tok,
-        temperature=temp,
-        top_p=tp,
-        repeat_penalty=rep_pen,
         stop=["<|im_end|>", "<|im_start|>"],
         stream=True,
     ):
-        partial += chunk["choices"][0]["text"]
-        yield partial
-# ====================== RUNTIME STATUS BADGE ======================
-def _build_status() -> str:
-    parts = []
-    if IS_HF_SPACE and not IS_LOCAL:
-        parts.append("☁️ HuggingFace Space · CPU-only")
-    else:
-        parts.append("🖥️ Local mode")
-        parts.append("⚡ GPU (CUDA)" if CUDA_AVAILABLE else "🐢 CPU-only")
-    parts.append(f"threads={N_THREADS}")
-    return "  |  ".join(parts)
-STATUS_LINE = _build_status()
 # ====================== GRADIO INTERFACE ======================
 with gr.Blocks(title="Article Q&A model") as demo:
     gr.Markdown("# sourcing 350+ articles on change")
-    gr.Markdown(
-        "Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com — "
-        "experimental on CPU-only, to test embedding methods (takes a few minutes, "
-        "no selection for the category yet) — updated as of 2026-05-05"
-    )
-    gr.Markdown(f"**Runtime:** {STATUS_LINE}")
-    gr.Markdown(
-        "**NOTAM:** by querying this model you access the articles and metadata "
-        "available on robertolofaro.com and GitHub.  "
-        "Answers reflect the article corpus only — do not treat them as advice specific to your context."
-    )
-    gr.Markdown(
-        "If, after getting an answer, you want something more contextualised, "
-        "contact a consultant (myself included)."
-    )
     with gr.Row():
         rag_mode = gr.Radio(
             choices=list(BACKENDS.keys()),
             value="FAISS - RAG (HNSW)",
-            label="Retrieval mode",
         )
         article_filter = gr.Dropdown(
             choices=ARTICLE_LIST,
             value="All categories",
-            label="Focus on category",
-        )
-    with gr.Row():
-        use_morning_news = gr.Checkbox(
-            value=False,
-            label="📰 Read MorningNews",
-            info="Supplement with the latest Morning News briefing fetched from GitHub "
-                 f"(robertolofaro/supportmaterial · {GH_NEWS_PATH}). "
-                 "Results are cached for 1 hour.",
         )
     with gr.Accordion("Advanced Generation Parameters", open=False):
-        max_tokens    = gr.Slider(256, 2048, value=900,  step=64,   label="Max Tokens")
-        temperature   = gr.Slider(0.0, 1.0,  value=0.65, step=0.05, label="Temperature")
-        top_p         = gr.Slider(0.0, 1.0,  value=0.9,  step=0.05, label="Top-p")
-        repeat_penalty = gr.Slider(1.0, 2.0, value=1.1,  step=0.05, label="Repeat Penalty")
     gr.ChatInterface(
         fn=generate_response,
-        additional_inputs=[
-            rag_mode, article_filter,
-            use_morning_news,
-            max_tokens, temperature, top_p, repeat_penalty,
-        ],
-        cache_examples=False,   # prevents Gradio from running examples at startup
         examples=[
             ["What is the potential for Italy? /nothink"],
-            ["What is the potential for Turin? /nothink"],
         ],
     )
 if __name__ == "__main__":
-    # Local launch: share=False keeps it on localhost only.
-    # Set share=True if you want a temporary public Gradio tunnel.
-    demo.queue(default_concurrency_limit=1).launch(share=False)

 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 import pickle
 from langchain_huggingface import HuggingFaceEmbeddings
 # ====================== CONFIG ======================
+repo_id = "robertolofaro/articles-model"
 BACKENDS = {
     "FAISS - RAG (HNSW)": "FAISS",
+    "Qdrant - RAG": "Qdrant"
 }
+FAISS_PATH = "faiss_index_hnsw"
+QDRANT_PATH = "qdrant_db"
+QDRANT_COLLECTION = "articles"
+# ====================== LOAD METADATA FOR ARTICLE LIST ======================
+def load_articles_list():
     try:
         with open("metadata.pkl", "rb") as f:
             df = pickle.load(f)
+        articles = sorted(df['article_category'].unique().tolist())
+        return ["All categories"] + articles
+    except:
         return ["All categories"]
 ARTICLE_LIST = load_articles_list()
 # ====================== LOAD LLM ======================
+model_path = hf_hub_download(
+    repo_id=repo_id,
+    filename="articles-Q4_K_M.gguf",
+    repo_type="model",
+    token=os.environ.get("HF_TOKEN")
+)
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,
+    n_threads=2,
+    n_batch=512,
+    n_ubatch=512,
+    verbose=False,
+)
+# ====================== RAG CACHE ======================
+vectorstores = {}
 def get_vectorstore(backend_name: str):
+    if backend_name in vectorstores:
+        return vectorstores[backend_name]
+    # Loading logic is the same as in the previous version (Chroma, FAISS, Qdrant);
+    # it is kept short here for brevity.
     try:
+        embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={'normalize_embeddings': True})
         if backend_name == "FAISS":
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
         else:
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
+        vectorstores[backend_name] = vs
         return vs
+    except:
         return None
+# ====================== SYSTEM PROMPT ======================
+SYSTEM_PROMPT = """You are the reference expert for the articles contained in the training of this model, all extracted from the website robertolofaro.com, and all focused on change.
+#Your Mission:
+When a user asks a question, your goal is to provide a structured response based ONLY on the articles provided in your training. Do not provide general advice from outside these sources.
+# Response Format:
 1. Executive Summary: A 2-3 sentence overview answering the core query.
+2. Guidelines & Hints: A markdown list of specific "answers/guidelines/hints" found in the source material.
 """
 # ====================== GENERATION FUNCTION ======================
+def generate_response(message, history, rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty):
+    full_prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
     for msg in history[-4:]:
         full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
     backend = BACKENDS.get(rag_mode)
+    context = ""
     if backend:
         vs = get_vectorstore(backend)
         if vs:
             try:
+                filter_dict = {"article_category": article_filter} if article_filter != "All categories" else None
+                docs = vs.similarity_search(message, k=5, filter=filter_dict)
+                context = "\n\n".join([
+                    f"[Category: {doc.metadata.get('article_category', 'N/A')}] {doc.page_content[:700]}"
+                    for doc in docs
+                ])
+            except:
+                pass
     if context:
         full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
     full_prompt += "<|im_start|>assistant\n"
+    max_tokens_val = int(max_tokens) if max_tokens is not None else 900
+    temp_val = float(temperature) if temperature is not None else 0.65
+    top_p_val = float(top_p) if top_p is not None else 0.9
+    rep_penalty_val = float(repeat_penalty) if repeat_penalty is not None else 1.1
+    partial_text = ""
     for chunk in llm(
         full_prompt,
+        max_tokens=max_tokens_val,
+        temperature=temp_val,
+        top_p=top_p_val,
+        repeat_penalty=rep_penalty_val,
         stop=["<|im_end|>", "<|im_start|>"],
         stream=True,
     ):
+        token = chunk['choices'][0]['text']
+        partial_text += token
+        yield partial_text
 # ====================== GRADIO INTERFACE ======================
 with gr.Blocks(title="Article Q&A model") as demo:
     gr.Markdown("# sourcing 350+ articles on change")
+    gr.Markdown("Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com - experimental on CPU-only, to test embedding methods (takes few minutes, no selection for the category yet) - updated as of 2026-05-05")
     with gr.Row():
         rag_mode = gr.Radio(
             choices=list(BACKENDS.keys()),
             value="FAISS - RAG (HNSW)",
+            label="Mode"
         )
         article_filter = gr.Dropdown(
             choices=ARTICLE_LIST,
             value="All categories",
+            label="Focus on category"
         )
     with gr.Accordion("Advanced Generation Parameters", open=False):
+        max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
+        temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
+        top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
+        repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
     gr.ChatInterface(
         fn=generate_response,
+        additional_inputs=[rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty],
+        cache_examples=False, # <--- Stops Gradio from executing them at startup
         examples=[
             ["What is the potential for Italy? /nothink"],
+            ["What is the potential for Turin? /nothink"]
         ],
     )
 if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=1).launch()