Spaces:

robertolofaro
/

articles

Sleeping

App Files Files Community

robertolofaro committed 6 days ago

Commit

e26d588

verified ·

1 Parent(s): 777360c

Upload app.py

Browse files

Files changed (1) hide show

app.py +355 -81

app.py CHANGED Viewed

@@ -1,62 +1,134 @@
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 import pickle
 from langchain_huggingface import HuggingFaceEmbeddings
 # ====================== CONFIG ======================
-repo_id = "robertolofaro/articles-model"
 BACKENDS = {
     "FAISS - RAG (HNSW)": "FAISS",
-    "Qdrant - RAG": "Qdrant"
 }
-FAISS_PATH = "faiss_index_hnsw"
-QDRANT_PATH = "qdrant_db"
-QDRANT_COLLECTION = "articles"
-# ====================== LOAD METADATA FOR ARTICLE LIST ======================
-def load_articles_list():
     try:
         with open("metadata.pkl", "rb") as f:
             df = pickle.load(f)
-        articles = sorted(df['article_category'].unique().tolist())
-        return ["All categories"] + articles
-    except:
         return ["All categories"]
 ARTICLE_LIST = load_articles_list()
 # ====================== LOAD LLM ======================
-model_path = hf_hub_download(
-    repo_id=repo_id,
-    filename="articles-Q4_K_M.gguf",
-    repo_type="model",
-    token=os.environ.get("HF_TOKEN")
-)
-llm = Llama(
-    model_path=model_path,
-    n_ctx=4096,
-    n_threads=2,
-    n_batch=512,
-    n_ubatch=512,
-    verbose=False,
-)
-# ====================== RAG CACHE ======================
-vectorstores = {}
 def get_vectorstore(backend_name: str):
-    if backend_name in vectorstores:
-        return vectorstores[backend_name]
-    # ... (same loading logic as before - Chroma, FAISS, Qdrant) ...
-    # I'll keep it short here for brevity, but same as previous version
     try:
-        embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5", encode_kwargs={'normalize_embeddings': True})
         if backend_name == "FAISS":
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
@@ -66,45 +138,202 @@ def get_vectorstore(backend_name: str):
         else:
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
-        vectorstores[backend_name] = vs
         return vs
-    except:
         return None
-# ====================== SYSTEM PROMPT ======================
-SYSTEM_PROMPT = """You are the reference expert for the articles contained in the training of this model, all extracted from the website robertolofaro.com, and all focused on change.
-#Your Mission:
-When a user asks a question, your goal is to provide a structured response based ONLY on the articles provided in your training. Do not provide general advice from outside these sources.
-# Response Format:
 1. Executive Summary: A 2-3 sentence overview answering the core query.
-2. Guidelines & Hints: A markdown list of specific "answers/guidelines/hints" found in the source material.
 """
 # ====================== GENERATION FUNCTION ======================
-def generate_response(message, history, rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty):
-    full_prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
     for msg in history[-4:]:
         full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
-    backend = BACKENDS.get(rag_mode)
-    context = ""
     if backend:
         vs = get_vectorstore(backend)
         if vs:
             try:
-                filter_dict = {"article_category": article_filter} if article_filter != "All categories" else None
-                docs = vs.similarity_search(message, k=5, filter=filter_dict)
-                context = "\n\n".join([
-                    f"[Category: {doc.metadata.get('article_category', 'N/A')}] {doc.page_content[:700]}"
-                    for doc in docs
-                ])
-            except:
-                pass
     if context:
         full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
@@ -113,60 +342,105 @@ def generate_response(message, history, rag_mode, article_filter, max_tokens, te
     full_prompt += "<|im_start|>assistant\n"
-    max_tokens_val = int(max_tokens) if max_tokens is not None else 900
-    temp_val = float(temperature) if temperature is not None else 0.65
-    top_p_val = float(top_p) if top_p is not None else 0.9
-    rep_penalty_val = float(repeat_penalty) if repeat_penalty is not None else 1.1
-    partial_text = ""
     for chunk in llm(
         full_prompt,
-        max_tokens=max_tokens_val,
-        temperature=temp_val,
-        top_p=top_p_val,
-        repeat_penalty=rep_penalty_val,
         stop=["<|im_end|>", "<|im_start|>"],
         stream=True,
     ):
-        token = chunk['choices'][0]['text']
-        partial_text += token
-        yield partial_text
 # ====================== GRADIO INTERFACE ======================
 with gr.Blocks(title="Article Q&A model") as demo:
     gr.Markdown("# sourcing 350+ articles on change")
-    gr.Markdown("Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com - experimental on CPU-only, to test embedding methods (takes few minutes, no selection for the category yet) - updated as of 2026-05-05")
-    gr.Markdown("NOTAM: a fair warning- by querying this model, you will access the articles and metadata that you can find also on robertolofaro.com and GitHub.")
-    gr.Markdown("Each article contains questions and answers, but only focused on the article- do not take any answer as advice, as your own context is not 'known' to the articles.")
-    gr.Markdown("If, after getting the answer, you would like something more contextualized, contact some consultants (myself included).")
     with gr.Row():
         rag_mode = gr.Radio(
             choices=list(BACKENDS.keys()),
             value="FAISS - RAG (HNSW)",
-            label="Mode"
         )
         article_filter = gr.Dropdown(
             choices=ARTICLE_LIST,
             value="All categories",
-            label="Focus on category"
         )
     with gr.Accordion("Advanced Generation Parameters", open=False):
-        max_tokens = gr.Slider(256, 2048, value=900, step=64, label="Max Tokens")
-        temperature = gr.Slider(0.0, 1.0, value=0.65, step=0.05, label="Temperature")
-        top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.05, label="Top-p")
-        repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
     gr.ChatInterface(
         fn=generate_response,
-        additional_inputs=[rag_mode, article_filter, max_tokens, temperature, top_p, repeat_penalty],
-        cache_examples=False, # <--- Stops Gradio from executing them at startup
         examples=[
             ["What is the potential for Italy? /nothink"],
-            ["What is the potential for Turin? /nothink"]
         ],
     )
 if __name__ == "__main__":
-    demo.queue(default_concurrency_limit=1).launch()

+"""
+app.py  –  Article Q&A chatbot
+Runs on:
+  • Hugging Face Spaces  (CPU-only, default)
+  • Local PC             (CPU or CUDA GPU)
+Environment variables
+---------------------
+HF_TOKEN            HuggingFace token for private model repo  (required on HF Space)
+LOCAL_MODE          Set to "1" to force local-PC behaviour     (optional; auto-detected via SPACE_ID)
+LOCAL_MODEL_PATH    Absolute path to the .gguf file on disk    (optional; skips HF hub download)
+GITHUB_TOKEN        GitHub PAT for higher rate-limits          (optional; works without it)
+N_THREADS           Override CPU thread count                  (optional)
+"""
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
 import pickle
+import requests
+from datetime import datetime, timedelta
 from langchain_huggingface import HuggingFaceEmbeddings
+# ====================== ENVIRONMENT DETECTION ======================
+# HuggingFace Spaces always set SPACE_ID; absent → we're running locally.
+IS_HF_SPACE = bool(os.environ.get("SPACE_ID"))
+IS_LOCAL    = (not IS_HF_SPACE) or (os.environ.get("LOCAL_MODE", "0") == "1")
+def _detect_cuda() -> bool:
+    """Return True when CUDA appears to be present (torch reports it, or libcuda.so.1 loads); llama-cpp's own CUDA support is not checked."""
+    if not IS_LOCAL:
+        return False          # HF free tier is CPU-only
+    try:
+        import torch
+        return torch.cuda.is_available()
+    except ImportError:
+        pass
+    # Fallback: check for libcuda without torch
+    try:
+        import ctypes
+        ctypes.cdll.LoadLibrary("libcuda.so.1")
+        return True
+    except Exception:
+        return False
+CUDA_AVAILABLE = _detect_cuda()
+# -1  → offload every layer to GPU;  0 → pure CPU
+N_GPU_LAYERS   = -1 if CUDA_AVAILABLE else 0
+# Use all available cores locally; HF free tier: keep at 2 to avoid OOM
+N_THREADS      = int(os.environ.get("N_THREADS", os.cpu_count() if IS_LOCAL else 2))
 # ====================== CONFIG ======================
+REPO_ID            = "robertolofaro/articles-model"
+MODEL_FILENAME     = "articles-Q4_K_M.gguf"
 BACKENDS = {
     "FAISS - RAG (HNSW)": "FAISS",
+    "Qdrant - RAG":        "Qdrant",
 }
+FAISS_PATH         = "faiss_index_hnsw"
+QDRANT_PATH        = "qdrant_db"
+QDRANT_COLLECTION  = "articles"
+# MorningNews GitHub location
+GH_OWNER           = "robertolofaro"
+GH_REPO            = "supportmaterial"
+GH_NEWS_PATH       = "MorningNewsAgentTest"
+GH_API_ROOT        = "https://api.github.com"
+GH_RAW_ROOT        = "https://raw.githubusercontent.com"
+NEWS_ACCEPTED_EXT  = (".txt", ".md", ".json")
+NEWS_MAX_CHARS_FILE = 2000   # chars kept per file
+NEWS_MAX_CHARS_TOTAL = 3500  # total chars injected into prompt
+NEWS_CACHE_TTL     = timedelta(hours=1)
+# Web search
+WEB_MAX_RESULTS    = 5
+WEB_MAX_CHARS      = 2500    # total chars from web injected into prompt
+# ====================== LOAD METADATA ======================
+def load_articles_list() -> list[str]:
     try:
         with open("metadata.pkl", "rb") as f:
             df = pickle.load(f)
+        cats = sorted(df["article_category"].unique().tolist())
+        return ["All categories"] + cats
+    except Exception:
         return ["All categories"]
 ARTICLE_LIST = load_articles_list()
 # ====================== LOAD LLM ======================
+def _load_llm() -> Llama:
+    local_path = os.environ.get("LOCAL_MODEL_PATH", "")
+    if IS_LOCAL and local_path and os.path.isfile(local_path):
+        model_path = local_path
+        print(f"[LLM] Loading from local path: {model_path}")
+    else:
+        model_path = hf_hub_download(
+            repo_id=REPO_ID,
+            filename=MODEL_FILENAME,
+            repo_type="model",
+            token=os.environ.get("HF_TOKEN"),
+        )
+        print(f"[LLM] Downloaded from HF hub → {model_path}")
+    print(f"[LLM] n_gpu_layers={N_GPU_LAYERS}  n_threads={N_THREADS}  cuda={CUDA_AVAILABLE}")
+    return Llama(
+        model_path=model_path,
+        n_ctx=4096,
+        n_threads=N_THREADS,
+        n_batch=512,
+        n_ubatch=512,
+        n_gpu_layers=N_GPU_LAYERS,
+        verbose=False,
+    )
+llm = _load_llm()
+# ====================== RAG VECTORSTORE CACHE ======================
+_vectorstores: dict = {}
 def get_vectorstore(backend_name: str):
+    if backend_name in _vectorstores:
+        return _vectorstores[backend_name]
     try:
+        embeddings = HuggingFaceEmbeddings(
+            model_name="BAAI/bge-small-en-v1.5",
+            encode_kwargs={"normalize_embeddings": True},
+        )
         if backend_name == "FAISS":
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
         else:
             from langchain_community.vectorstores import FAISS
             vs = FAISS.load_local(FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
+        _vectorstores[backend_name] = vs
         return vs
+    except Exception as e:
+        print(f"[RAG] Could not load vectorstore '{backend_name}': {e}")
         return None
+# ====================== MORNING NEWS FETCHER ======================
+_news_cache: dict = {"content": None, "fetched_at": None}
+def fetch_morning_news() -> str:
+    """
+    Fetch text/md/json files from the MorningNewsAgentTest directory on GitHub.
+    Results are cached for NEWS_CACHE_TTL to avoid hammering the API.
+    Works with or without a GITHUB_TOKEN (unauthenticated rate-limit: 60 req/hr).
+    """
+    global _news_cache
+    now = datetime.utcnow()
+    # Serve from cache if still fresh
+    if _news_cache["content"] is not None and _news_cache["fetched_at"]:
+        if now - _news_cache["fetched_at"] < NEWS_CACHE_TTL:
+            print("[MorningNews] Serving from cache")
+            return _news_cache["content"]
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    gh_token = os.environ.get("GITHUB_TOKEN", "")
+    if gh_token:
+        headers["Authorization"] = f"token {gh_token}"
+    try:
+        # List files in the directory
+        dir_url = f"{GH_API_ROOT}/repos/{GH_OWNER}/{GH_REPO}/contents/{GH_NEWS_PATH}"
+        resp = requests.get(dir_url, headers=headers, timeout=10)
+        resp.raise_for_status()
+        entries = resp.json()
+        # Sort by name descending so the most recent file (date-prefixed) comes first
+        entries = sorted(
+            [e for e in entries if e["type"] == "file"
+             and e["name"].lower().endswith(NEWS_ACCEPTED_EXT)],
+            key=lambda e: e["name"],
+            reverse=True,
+        )
+        collected, total_chars = [], 0
+        for entry in entries:
+            if total_chars >= NEWS_MAX_CHARS_TOTAL:
+                break
+            raw_url = entry["download_url"]
+            try:
+                file_resp = requests.get(raw_url, headers=headers, timeout=10)
+                file_resp.raise_for_status()
+                snippet = file_resp.text[:NEWS_MAX_CHARS_FILE]
+                collected.append(f"--- [{entry['name']}] ---\n{snippet}")
+                total_chars += len(snippet)
+            except Exception as fe:
+                print(f"[MorningNews] Could not fetch {entry['name']}: {fe}")
+        combined = "\n\n".join(collected)[:NEWS_MAX_CHARS_TOTAL]
+        _news_cache = {"content": combined, "fetched_at": now}
+        print(f"[MorningNews] Fetched {len(collected)} file(s), {len(combined)} chars")
+        return combined
+    except Exception as e:
+        print(f"[MorningNews] Directory listing failed: {e}")
+        # Return stale cache rather than nothing if available
+        return _news_cache.get("content") or ""
+# ====================== WEB SEARCH (DuckDuckGo) ======================
+def search_web(query: str) -> str:
+    """
+    Search DuckDuckGo via duckduckgo-search and return a compact text block.
+    Gracefully degrades to an empty string if the package is missing or
+    the search fails (e.g. rate-limited on HF Spaces).
+    """
+    try:
+        from duckduckgo_search import DDGS
+    except ImportError:
+        print("[WebSearch] duckduckgo-search not installed – skipping")
+        return ""
+    try:
+        results = []
+        with DDGS() as ddgs:
+            for hit in ddgs.text(query, max_results=WEB_MAX_RESULTS):
+                title   = hit.get("title", "").strip()
+                body    = hit.get("body",  "").strip()[:400]
+                href    = hit.get("href",  "")
+                results.append(f"• {title}\n  {body}\n  ({href})")
+        combined = "\n\n".join(results)[:WEB_MAX_CHARS]
+        print(f"[WebSearch] {len(results)} result(s) for: {query[:60]}")
+        return combined
+    except Exception as e:
+        print(f"[WebSearch] Search failed: {e}")
+        return ""
+# ====================== SYSTEM PROMPTS ======================
+# Base prompt – articles only
+SYSTEM_PROMPT_BASE = """You are the reference expert for the articles contained in the training of this model, \
+all extracted from the website robertolofaro.com, and all focused on change.
+# Your Mission
+When a user asks a question, provide a structured response based ONLY on the articles in your training. \
+Do not provide general advice from outside these sources.
+# Response Format
+1. Executive Summary: A 2-3 sentence overview answering the core query.
+2. Guidelines & Hints: A markdown list of specific answers/guidelines/hints found in the source material.
+"""
+# Extended prompt – when extra sources are active
+SYSTEM_PROMPT_EXTENDED = """You are the reference expert for the articles contained in the training of this model, \
+all extracted from the website robertolofaro.com, and all focused on change. \
+You have also been provided with supplementary external context (morning news and/or web results).
+# Your Mission
+Provide a structured response that integrates all available information. \
+Clearly tag each insight with its source label so the reader can judge its provenance:
+  [Articles]     – insight from the trained article corpus
+  [MorningNews]  – insight from the morning news briefing
+  [Web]          – insight from live web search results
+# Response Format
 1. Executive Summary: A 2-3 sentence overview answering the core query.
+2. Guidelines & Hints: A markdown list of tagged insights from the source material.
+3. Additional Context (when MorningNews or Web results are present): \
+   brief synthesis of external findings relevant to the query.
 """
+# ====================== CONTEXT BUDGET HELPER ======================
+# Rough token estimate: 1 token ≈ 4 chars for English text.
+# n_ctx=4096 → reserving ~800 tokens for the answer and ~400 for system+history leaves ~2900 tokens (≈11,600 chars); the 2900-char budget below (≈725 tokens) stays well inside that.
+CONTEXT_BUDGET_CHARS = 2900
+def _trim_to_budget(parts: list[tuple[str, str]]) -> str:
+    """
+    parts = [(label, text), ...]
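+    Splits CONTEXT_BUDGET_CHARS equally across the non-empty sources (each text is cut to its share; unused share is not redistributed),
+    then returns a single assembled context string. The "=== label ===" headers and separators are not counted against the budget.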
+    """
+    # Keep only the sources that actually contain text (empty or whitespace-only ones are dropped)
+    totals = [(label, text) for label, text in parts if text.strip()]
+    if not totals:
+        return ""
+    per_source = CONTEXT_BUDGET_CHARS // len(totals)
+    sections = []
+    for label, text in totals:
+        trimmed = text[:per_source]
+        sections.append(f"=== {label} ===\n{trimmed}")
+    return "\n\n".join(sections)
 # ====================== GENERATION FUNCTION ======================
+def generate_response(
+    message, history,
+    rag_mode, article_filter,
+    use_morning_news, use_web_search,
+    max_tokens, temperature, top_p, repeat_penalty,
+):
+    has_extra = use_morning_news or use_web_search
+    system_prompt = SYSTEM_PROMPT_EXTENDED if has_extra else SYSTEM_PROMPT_BASE
+    full_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+    # Keep only the last 4 history messages (individual role/content entries, not 4 full exchanges) to limit context pressure
     for msg in history[-4:]:
         full_prompt += f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
+    # ---- Gather context from all active sources ----
+    context_parts: list[tuple[str, str]] = []
+    # 1. RAG (vectorstore)
+    backend = BACKENDS.get(rag_mode)
     if backend:
         vs = get_vectorstore(backend)
         if vs:
             try:
+                filt = {"article_category": article_filter} if article_filter != "All categories" else None
+                docs = vs.similarity_search(message, k=5, filter=filt)
+                rag_text = "\n\n".join(
+                    f"[Cat: {d.metadata.get('article_category','N/A')}] {d.page_content[:700]}"
+                    for d in docs
+                )
+                context_parts.append(("ARTICLES CONTEXT", rag_text))
+            except Exception as e:
+                print(f"[RAG] similarity_search failed: {e}")
+    # 2. Morning News
+    if use_morning_news:
+        news = fetch_morning_news()
+        if news:
+            context_parts.append(("MORNING NEWS BRIEFING", news))
+    # 3. Web search
+    if use_web_search:
+        web = search_web(message)
+        if web:
+            context_parts.append(("WEB SEARCH RESULTS", web))
+    # ---- Assemble context within token budget ----
+    context = _trim_to_budget(context_parts)
     if context:
         full_prompt += f"<|im_start|>user\nContext:\n{context}\n\nQuestion: {message}<|im_end|>\n"
     full_prompt += "<|im_start|>assistant\n"
+    # ---- Inference parameters ----
+    max_tok  = int(max_tokens)    if max_tokens    is not None else 900
+    temp     = float(temperature) if temperature   is not None else 0.65
+    tp       = float(top_p)       if top_p         is not None else 0.9
+    rep_pen  = float(repeat_penalty) if repeat_penalty is not None else 1.1
+    partial = ""
     for chunk in llm(
         full_prompt,
+        max_tokens=max_tok,
+        temperature=temp,
+        top_p=tp,
+        repeat_penalty=rep_pen,
         stop=["<|im_end|>", "<|im_start|>"],
         stream=True,
     ):
+        partial += chunk["choices"][0]["text"]
+        yield partial
+# ====================== RUNTIME STATUS BADGE ======================
+def _build_status() -> str:
+    parts = []
+    if IS_HF_SPACE and not IS_LOCAL:
+        parts.append("☁️ HuggingFace Space · CPU-only")
+    else:
+        parts.append("🖥️ Local mode")
+        parts.append("⚡ GPU (CUDA)" if CUDA_AVAILABLE else "🐢 CPU-only")
+    parts.append(f"threads={N_THREADS}")
+    return "  |  ".join(parts)
+STATUS_LINE = _build_status()
 # ====================== GRADIO INTERFACE ======================
 with gr.Blocks(title="Article Q&A model") as demo:
     gr.Markdown("# sourcing 350+ articles on change")
+    gr.Markdown(
+        "Qwen3.5-4B DoRA fine-tuned on 350+ articles on change from robertolofaro.com — "
+        "experimental on CPU-only, to test embedding methods (takes a few minutes, "
+        "no selection for the category yet) — updated as of 2026-05-05"
+    )
+    gr.Markdown(f"**Runtime:** {STATUS_LINE}")
+    gr.Markdown(
+        "**NOTAM:** by querying this model you access the articles and metadata "
+        "available on robertolofaro.com and GitHub.  "
+        "Answers reflect the article corpus only — do not treat them as personal advice."
+    )
+    gr.Markdown(
+        "If, after getting an answer, you want something more contextualised, "
+        "contact a consultant (myself included)."
+    )
     with gr.Row():
         rag_mode = gr.Radio(
             choices=list(BACKENDS.keys()),
             value="FAISS - RAG (HNSW)",
+            label="Retrieval mode",
         )
         article_filter = gr.Dropdown(
             choices=ARTICLE_LIST,
             value="All categories",
+            label="Focus on category",
+        )
+    with gr.Row():
+        use_morning_news = gr.Checkbox(
+            value=False,
+            label="📰 Read MorningNews",
+            info="Supplement with the latest Morning News briefing fetched from GitHub "
+                 f"(robertolofaro/supportmaterial · {GH_NEWS_PATH}). "
+                 "Results are cached for 1 hour.",
+        )
+        use_web_search = gr.Checkbox(
+            value=False,
+            label="🔍 Search Web (DuckDuckGo)",
+            info="Complement the answer with live web search results via DuckDuckGo. "
+                 "Note: may be rate-limited on the free HF Space tier.",
         )
     with gr.Accordion("Advanced Generation Parameters", open=False):
+        max_tokens    = gr.Slider(256, 2048, value=900,  step=64,   label="Max Tokens")
+        temperature   = gr.Slider(0.0, 1.0,  value=0.65, step=0.05, label="Temperature")
+        top_p         = gr.Slider(0.0, 1.0,  value=0.9,  step=0.05, label="Top-p")
+        repeat_penalty = gr.Slider(1.0, 2.0, value=1.1,  step=0.05, label="Repeat Penalty")
     gr.ChatInterface(
         fn=generate_response,
+        additional_inputs=[
+            rag_mode, article_filter,
+            use_morning_news, use_web_search,
+            max_tokens, temperature, top_p, repeat_penalty,
+        ],
+        cache_examples=False,   # prevents Gradio from running examples at startup
         examples=[
             ["What is the potential for Italy? /nothink"],
+            ["What is the potential for Turin? /nothink"],
         ],
     )
 if __name__ == "__main__":
+    # Local launch: share=False keeps it on localhost only.
+    # Set share=True if you want a temporary public Gradio tunnel.
+    demo.queue(default_concurrency_limit=1).launch(share=False)