aach456 committed on
Commit 7c22e3c · verified · 1 Parent(s): fc96ea6

Update app.py

Files changed (1):
  1. app.py +143 -238

app.py CHANGED
@@ -1,46 +1,12 @@
- # app.py
- # Chat-style RAG app with Streamlit chat UI, FAISS retrieval, SentenceTransformers embeddings,
- # and an open Mistral-7B pipeline. All caches redirected to /tmp to avoid PermissionError.
-
- # ---------- Writable dirs BEFORE third-party imports ----------
- import os, glob, tempfile
- # Streamlit internal runtime dir -> /tmp (fixes PermissionError: '/.streamlit')
- ST_RT = os.environ.get("STREAMLIT_RUNTIME_DIR", "/tmp/.streamlit_runtime")
- try:
-     os.makedirs(ST_RT, exist_ok=True)
- except Exception:
-     ST_RT = tempfile.mkdtemp(prefix="st_runtime_")
- os.environ["STREAMLIT_RUNTIME_DIR"] = ST_RT
-
- # Hugging Face caches -> /tmp
- HF_HOME = os.environ.get("HF_HOME", "/tmp/hf_cache")
- try:
-     os.makedirs(HF_HOME, exist_ok=True)
- except Exception:
-     HF_HOME = tempfile.mkdtemp(prefix="hf_cache_")
- os.environ["HF_HOME"] = HF_HOME
- os.environ["TRANSFORMERS_CACHE"] = HF_HOME  # backward-compat; deprecation warning is harmless
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = HF_HOME
- os.environ["HF_DATASETS_CACHE"] = HF_HOME
- os.environ["XDG_CACHE_HOME"] = HF_HOME
- os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
-
- # Clean stale locks
- locks_dir = os.path.join(HF_HOME, "hub", ".locks")
- if os.path.isdir(locks_dir):
-     for p in glob.glob(os.path.join(locks_dir, "*.lock")):
-         try:
-             os.remove(p)
-         except Exception:
-             pass
-
- # ---------- Imports AFTER env is set ----------
  import io
  import time
- import pandas as pd
  import numpy as np
  import requests
- import streamlit as st
  from bs4 import BeautifulSoup
  from PyPDF2 import PdfReader
  from docx import Document
@@ -49,222 +15,161 @@ from sentence_transformers import SentenceTransformer
  from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
  import faiss

- # ---------- Page ----------
- st.set_page_config(page_title="Chat RAG • Open Model + URLs", layout="wide")
- st.title("💬 Chat RAG with Open Model, FAISS, and Web URLs")
-
- # ---------- Session ----------
- for key, default in [
-     ("messages", []),
-     ("chunks", []),
-     ("embedder", None),
-     ("faiss_index", None),
- ]:
-     if key not in st.session_state:
-         st.session_state[key] = default
-
- # ---------- Loaders ----------
- def load_txt(file):
-     raw = file.read()
-     for enc in ("utf-8", "latin-1"):
-         try:
-             return [{"source": file.name, "text": raw.decode(enc, errors="ignore")}]
-         except Exception:
-             continue
-     return [{"source": file.name, "text": raw.decode("utf-8", errors="ignore")}]
-
- def load_pdf(file):
-     pdf = PdfReader(file)
-     text = ""
-     for page in pdf.pages:
-         text += page.extract_text() or ""
-     return [{"source": file.name, "text": text}]
-
- def load_docx(file):
-     data = file.read()
-     doc = Document(io.BytesIO(data))
-     text = " ".join(p.text for p in doc.paragraphs)
-     return [{"source": file.name, "text": text}]

- def load_csv(file):
-     data = file.read()
-     df = None
-     for enc in ("utf-8", "latin-1"):
-         try:
-             df = pd.read_csv(io.BytesIO(data), encoding=enc)
-             break
-         except Exception:
-             df = None
-     if df is None:
-         try:
-             df = pd.read_csv(io.BytesIO(data), engine="python")
-         except Exception:
-             df = pd.DataFrame()
-     text = " ".join(df.astype(str).values.flatten().tolist()) if not df.empty else ""
-     return [{"source": file.name, "text": text}]

- def load_documents(files):
-     docs = []
-     for file in files or []:
-         name = file.name.lower()
-         if name.endswith(".pdf"):
-             docs += load_pdf(file)
-         elif name.endswith(".docx"):
-             docs += load_docx(file)
-         elif name.endswith(".csv"):
-             docs += load_csv(file)
-         elif name.endswith(".txt"):
-             docs += load_txt(file)
-     return docs

- # ---------- Web fetch ----------
- def fetch_web_text(url, timeout=12, retries=2, backoff=1.5):
-     for attempt in range(retries + 1):
-         try:
-             headers = {
-                 "User-Agent": (
-                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                     "AppleWebKit/537.36 (KHTML, like Gecko) "
-                     "Chrome/124.0 Safari/537.36"
-                 )
-             }
-             resp = requests.get(url, headers=headers, timeout=timeout)
-             resp.raise_for_status()
-             soup = BeautifulSoup(resp.text, "html.parser")
-             for tag in soup(["script", "style", "noscript"]):
-                 tag.decompose()
-             text = " ".join(soup.get_text(separator=" ").split())
-             return [{"source": url, "text": text}]
-         except Exception:
-             if attempt < retries:
-                 time.sleep(backoff ** attempt)
-             else:
-                 return []

- # ---------- Chunking ----------
- def chunk_documents(docs, chunk_size=1000, chunk_overlap=120):
      splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
      chunks = []
      for doc in docs:
-         splits = splitter.split_text(doc.get("text", "") or "")
          for idx, chunk in enumerate(splits):
              chunks.append({"source": doc["source"], "chunk_id": f"{doc['source']}_chunk{idx}", "content": chunk})
      return chunks

- # ---------- Embeddings / Index ----------
- @st.cache_resource(show_spinner=False)
- def load_embedder():
-     return SentenceTransformer("all-MiniLM-L6-v2", cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME", HF_HOME))
-
- def build_embeddings_index(chunks):
-     embedder = load_embedder()
-     texts = [c["content"] for c in chunks]
-     if not texts:
-         return embedder, None
-     emb = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
-     emb = np.asarray(emb, dtype="float32")
-     idx = faiss.IndexFlatL2(emb.shape[1])
-     idx.add(emb)
-     return embedder, idx
-
- def retrieve(query, embedder, index, chunks, top_k=4):
-     if index is None or not chunks:
          return []
      q_emb = embedder.encode([query], convert_to_numpy=True)
-     q_emb = np.asarray(q_emb, dtype="float32")
      distances, indices = index.search(q_emb, top_k)
-     out = []
-     for pos, i in enumerate(indices):
-         if i >= 0 and i < len(chunks):
-             out.append({"chunk": chunks[i], "score": float(distances[pos])})
-     return out
-
- # ---------- LLM ----------
- MODEL_ID = "MehdiHosseiniMoghadam/AVA-Mistral-7B-V2"
-
- @st.cache_resource(show_spinner=False)
- def load_llm():
-     cache_dir = os.environ.get("HF_HOME", HF_HOME)
-     _ = AutoConfig.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
-     tok = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
-     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
-     return pipeline("text-generation", model=model, tokenizer=tok, max_length=1024, do_sample=True, temperature=0.2, trust_remote_code=True, device_map="auto")
-
- def answer_with_llm(context_chunks, query, llm):
      context_text = "\n".join(f"[{c['chunk_id']}] {c['content']}" for c in context_chunks)
      prompt = (
          "Answer the following question using ONLY the provided context and cite the chunk ids used.\n"
-         f"Question: {query}\n"
-         "Context:\n"
-         f"{context_text}\n"
-         "Answer with citations:"
      )
-     out = llm(prompt, max_length=512, num_return_sequences=1)
-     return out["generated_text"]
-
- # ---------- Sidebar sources ----------
- st.sidebar.header("Data sources")
-
- uploaded_files = st.sidebar.file_uploader(
-     "Upload documents (PDF, DOCX, TXT, CSV)",
-     type=["pdf", "txt", "docx", "csv"],
-     accept_multiple_files=True,
-     help="Default per-file limit ~200MB; increase via .streamlit/config.toml if needed.",
- )
- with st.sidebar.expander("Upload debug"):
-     info = {
-         "type": type(uploaded_files).__name__,
-         "num_files": (len(uploaded_files) if isinstance(uploaded_files, list) else (1 if uploaded_files else 0)),
-         "names": ([f.name for f in uploaded_files] if isinstance(uploaded_files, list) else ([uploaded_files.name] if uploaded_files else [])),
-     }
-     st.write(info)
-
- url_input = st.sidebar.text_area("Web URLs (one per line)", value="", height=120)
-
- web_docs = []
- if url_input.strip():
-     urls = [u.strip() for u in url_input.splitlines() if u.strip()]
-     with st.sidebar.spinner("Fetching web content..."):
-         for u in urls:
-             web_docs += fetch_web_text(u)

- file_docs = load_documents(uploaded_files) if uploaded_files else []
- all_docs = file_docs + web_docs

- if all_docs:
-     st.success(f"{len(all_docs)} document(s) loaded from files and URLs.")
-     with st.spinner("Chunking and embedding..."):
-         st.session_state.chunks = chunk_documents(all_docs, chunk_size=1000, chunk_overlap=120)
-         st.session_state.embedder, st.session_state.faiss_index = build_embeddings_index(st.session_state.chunks)
-     st.write(f"{len(st.session_state.chunks)} chunks created and indexed.")
- else:
-     st.info("Add documents or URLs in the sidebar to start.")
-
- # ---------- Chat UI ----------
- for m in st.session_state.messages:
-     with st.chat_message(m["role"]):
-         st.markdown(m["content"])
-
- user_input = st.chat_input("Ask about the loaded documents...")
- if user_input:
-     st.session_state.messages.append({"role": "user", "content": user_input})
-     with st.chat_message("user"):
-         st.markdown(user_input)
-
-     with st.chat_message("assistant"):
-         with st.spinner("Thinking..."):
-             if st.session_state.chunks:
-                 llm = load_llm()
-                 results = retrieve(user_input, st.session_state.embedder, st.session_state.faiss_index, st.session_state.chunks, top_k=4)
-                 context_chunks = [r["chunk"] for r in results]
-                 answer = answer_with_llm(context_chunks, user_input, llm)
-                 st.markdown(answer)
-                 sources = "\n".join(f"[{r['chunk']['chunk_id']} from {r['chunk']['source']}]" for r in results) or "No sources (no matches)."
-                 with st.expander("Sources"):
-                     st.code(sources)
-             else:
-                 answer = "No documents indexed yet. Add files or URLs in the sidebar and try again."
-                 st.warning(answer)
-         st.session_state.messages.append({"role": "assistant", "content": answer})
-
- st.caption("Chat RAG • Mistral-7B (open), FAISS, SentenceTransformers, and Web URLs • Streamlit chat UI")
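End of the removed Streamlit version. The replacement file follows: the commit swaps the Streamlit chat UI for a Gradio Blocks app, replaces st.session_state with a module-level state dict, loads the embedder and LLM eagerly at import time instead of behind @st.cache_resource, and folds the per-format loaders into a single load_file_text.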
+ import os
  import io
+ import glob
+ import tempfile
  import time
  import numpy as np
+ import pandas as pd
  import requests
+ import gradio as gr
  from bs4 import BeautifulSoup
  from PyPDF2 import PdfReader
  from docx import Document

  from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
  import faiss

+ # Setup HF cache paths before imports
+ HF_HOME = os.environ.get("HF_HOME", "/tmp/hf_cache")
+ os.makedirs(HF_HOME, exist_ok=True)
+
+ os.environ["HF_HOME"] = HF_HOME
+ os.environ["TRANSFORMERS_CACHE"] = HF_HOME
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = HF_HOME
+ os.environ["HF_DATASETS_CACHE"] = HF_HOME
+ os.environ["XDG_CACHE_HOME"] = HF_HOME
+ os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+
+ locks_dir = os.path.join(HF_HOME, "hub", ".locks")
+ if os.path.isdir(locks_dir):
+     for p in glob.glob(os.path.join(locks_dir, "*.lock")):
+         try: os.remove(p)
+         except: pass
+
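Note: despite the comment, these assignments now run after the library imports, whereas the removed version set them first (its comment read "Writable dirs BEFORE third-party imports"), and huggingface_hub resolves HF_HOME when it is first imported. So some downloads may still land in the default cache. A minimal reordering sketch (an editorial suggestion, not part of the commit):

    import os
    os.environ.setdefault("HF_HOME", "/tmp/hf_cache")
    os.makedirs(os.environ["HF_HOME"], exist_ok=True)

    # heavy libraries are imported only after the cache location is fixed
    import transformers  # noqa: E402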
+ MODEL_ID = "MehdiHosseiniMoghadam/AVA-Mistral-7B-V2"
+
+ embedder = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=HF_HOME)
+ config = AutoConfig.from_pretrained(MODEL_ID, cache_dir=HF_HOME, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=HF_HOME, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=HF_HOME, trust_remote_code=True)
+ llm = pipeline("text-generation", model=model, tokenizer=tokenizer,
+                max_length=1024, do_sample=True, temperature=0.2,
+                trust_remote_code=True, device_map="auto")
+
+ def load_file_text(file):
+     name = file.name.lower()
+     if name.endswith(".pdf"):
+         reader = PdfReader(file)
+         text = "".join(page.extract_text() or "" for page in reader.pages)
+         return text
+     elif name.endswith(".docx"):
+         data = file.read()
+         doc = Document(io.BytesIO(data))
+         return " ".join(p.text for p in doc.paragraphs)
+     elif name.endswith(".csv"):
+         data = file.read()
+         for enc in ("utf-8", "latin-1"):
+             try:
+                 df = pd.read_csv(io.BytesIO(data), encoding=enc)
+                 return " ".join(df.astype(str).values.flatten().tolist())
+             except: pass
+         return ""
+     elif name.endswith(".txt"):
+         raw = file.read()
+         for enc in ("utf-8", "latin-1"):
+             try: return raw.decode(enc, errors="ignore")
+             except: continue
+         return raw.decode("utf-8", errors="ignore")
+     else:
+         return ""
+
+ def fetch_web_text(url):
+     try:
+         headers = {'User-Agent': 'Mozilla/5.0'}
+         resp = requests.get(url, headers=headers, timeout=10)
+         resp.raise_for_status()
+         soup = BeautifulSoup(resp.text, "html.parser")
+         for tag in soup(["script", "style", "noscript"]):
+             tag.decompose()
+         return " ".join(soup.get_text(separator=" ").split())
+     except Exception:
+         return ""
+
+ def chunk_docs(docs, chunk_size=1000, chunk_overlap=120):
      splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
      chunks = []
      for doc in docs:
+         splits = splitter.split_text(doc["text"])
          for idx, chunk in enumerate(splits):
              chunks.append({"source": doc["source"], "chunk_id": f"{doc['source']}_chunk{idx}", "content": chunk})
      return chunks

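For intuition: with the defaults above, chunk_docs cuts each document into pieces of at most 1000 characters whose edges overlap by about 120 characters, so text that straddles a boundary still appears whole in one chunk. A standalone check using the same splitter class app.py imports (illustrative values):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=120)
    text = "".join(str(i) for i in range(1000))   # 2890 chars, no natural separators
    pieces = splitter.split_text(text)
    print([len(p) for p in pieces])               # roughly [1000, 1000, 1000, 250]
    print(pieces[0][-120:] == pieces[1][:120])    # consecutive pieces share the overlap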
+ def build_index_and_chunks(docs):
+     chunks = chunk_docs(docs)
+     texts = [chunk["content"] for chunk in chunks]
+     if len(texts) == 0: return None, []
+     embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
+     embeddings = np.asarray(embeddings).astype("float32")
+     dim = embeddings.shape[1]
+     index = faiss.IndexFlatL2(dim)
+     index.add(embeddings)
+     return index, chunks
+
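IndexFlatL2 ranks chunks by squared Euclidean distance, so a smaller score means a closer match. MiniLM embeddings are not unit-normalized by default; a common cosine-similarity variant (a sketch of an alternative, not what this commit uses) normalizes the vectors and switches to an inner-product index:

    import faiss
    from sentence_transformers import SentenceTransformer

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    texts = ["a tiny corpus", "with three", "example chunks"]
    emb = embedder.encode(texts, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(emb)                  # unit-normalize rows in place
    index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine on unit vectors
    index.add(emb)
    scores, ids = index.search(emb[:1], 2)   # here HIGHER score means more similar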
+ def retrieve(query, index, chunks, top_k=3):
+     if index is None or len(chunks) == 0:
          return []
      q_emb = embedder.encode([query], convert_to_numpy=True)
+     q_emb = np.asarray(q_emb).astype("float32")
      distances, indices = index.search(q_emb, top_k)
+     results = []
+     for dist, idx in zip(distances[0], indices[0]):
+         if idx >= 0 and idx < len(chunks):
+             results.append({"chunk": chunks[idx], "score": float(dist)})
+     return results
+
+ def answer_question(query, index, chunks):
+     results = retrieve(query, index, chunks)
+     context_chunks = [r["chunk"] for r in results]
      context_text = "\n".join(f"[{c['chunk_id']}] {c['content']}" for c in context_chunks)
      prompt = (
          "Answer the following question using ONLY the provided context and cite the chunk ids used.\n"
+         f"Question: {query}\nContext:\n{context_text}\nAnswer with citations:"
      )
+     generated = llm(prompt, max_length=512, num_return_sequences=1)
+     return generated[0]["generated_text"], "\n".join(f"[{c['chunk_id']} from {c['source']}]" for c in context_chunks)
+
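One caveat in answer_question: for a transformers text-generation pipeline, max_length counts prompt tokens plus generated tokens, so three 1000-character chunks can push the prompt past the 512-token budget and leave little or no room for the answer. A hedged alternative (an editorial sketch reusing the llm pipeline defined above, not what the commit does) bounds only the continuation:

    generated = llm(prompt, max_new_tokens=256, num_return_sequences=1)
    answer = generated[0]["generated_text"]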
+ state = {"index": None, "chunks": []}
+
+ def process(files, urls):
+     docs = []
+     if files:
+         for f in files:
+             text = load_file_text(f)
+             if text:
+                 docs.append({"source": f.name, "text": text})
+     if urls:
+         for url in urls.strip().splitlines():
+             text = fetch_web_text(url.strip())
+             if text:
+                 docs.append({"source": url.strip(), "text": text})
+     if len(docs) == 0:
+         return "No documents or URLs loaded."
+     index, chunks = build_index_and_chunks(docs)
+     state["index"], state["chunks"] = index, chunks
+     return f"Loaded {len(docs)} docs, created {len(chunks)} chunks."
+
+ def chat_response(user_message, history):
+     if state["index"] is None or len(state["chunks"]) == 0:
+         bot_message = "Please upload documents or enter URLs, then press 'Load & Process' first."
+     else:
+         answer, sources = answer_question(user_message, state["index"], state["chunks"])
+         bot_message = answer + "\n\nSources:\n" + sources
+     history = history or []
+     history.append(("User: " + user_message, "Assistant: " + bot_message))
+     return "", history
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📚 RAG Chatbot with Mistral-7B and FAISS")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             file_input = gr.File(label="Upload Files (PDF, DOCX, TXT, CSV)", file_types=[".pdf", ".docx", ".txt", ".csv"], file_count="multiple")
+             url_input = gr.Textbox(label="Enter URLs (one per line)", lines=4)
+             process_button = gr.Button("Load & Process Documents and URLs")
+             output_log = gr.Textbox(label="Status")
+
+         with gr.Column(scale=2):
+             chatbot = gr.Chatbot()
+             user_input = gr.Textbox(placeholder="Ask a question about the loaded documents...", show_label=False)
+             submit_btn = gr.Button("Send")
+
+     process_button.click(process, inputs=[file_input, url_input], outputs=output_log)
+     submit_btn.click(chat_response, inputs=[user_input, chatbot], outputs=[user_input, chatbot])
+
+ demo.launch()
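The retrieval path can be smoke-tested without the UI, for example in a REPL after running everything above except demo.launch() (hypothetical URL, reusing process, answer_question, and state as defined in app.py):

    print(process([], "https://example.com"))
    # e.g. "Loaded 1 docs, created 1 chunks."
    answer, sources = answer_question("What is this page about?",
                                      state["index"], state["chunks"])
    print(answer)
    print(sources)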