Spaces:

GlitchGhost
/

Multi-Source-RAG-Assistant

Sleeping

File size: 5,238 Bytes

import os
import fitz
import tempfile
import requests
import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from transformers import pipeline
from huggingface_hub import HfApi, HfFolder, login


# === Embeddings Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    """Adapter exposing a ``SentenceTransformer`` model through the ``Embeddings`` interface."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformers model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode every string in *texts* and return the result as nested lists."""
        encoded_batch = self.model.encode(texts)
        return encoded_batch.tolist()

    def embed_query(self, text):
        """Encode a single query string and return its vector as a flat list."""
        # The model is fed a one-element batch; only that first row is kept.
        encoded_batch = self.model.encode([text])
        query_vector = encoded_batch[0]
        return query_vector.tolist()

# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, joined by newlines.

    The document is opened as a context manager so its file handle is
    released as soon as the text has been read, even if a page fails to
    extract.
    """
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def split_text(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character chunks that overlap their neighbours.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous
            one; must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the loop never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would
        # only repeat the overlap tail that is already included here.
        if end >= text_length:
            break
    return chunks

def login_to_huggingface(api_key):
    """Store *api_key* with ``HfFolder.save_token`` and report the outcome in the UI.

    Any exception raised while saving (or while rendering the success
    message) is shown through ``st.error`` instead of propagating.
    """
    try:
        HfFolder.save_token(api_key)
        st.success("✅ Logged into Hugging Face successfully!")
    except Exception as err:
        failure_message = f"❌ Failed to log in: {err}"
        st.error(failure_message)

@st.cache_resource(show_spinner=False)
def _load_qa_pipeline(hf_api_key):
    """Build the Mistral question-answering pipeline once per API key.

    ``st.cache_resource`` keeps the loaded pipeline alive across Streamlit
    reruns, so the model weights are not reloaded for every question.
    """
    # NOTE(review): "mistralai/Mistral-7B-v0.3" looks like a base text
    # generation checkpoint rather than one fine-tuned for extractive QA;
    # confirm the "question-answering" task is the intended pairing.
    return pipeline(
        "question-answering",
        model="mistralai/Mistral-7B-v0.3",
        tokenizer="mistralai/Mistral-7B-v0.3",
        use_auth_token=hf_api_key,
    )


def ask_mistral(question, context, hf_api_key):
    """Answer *question* from *context* with the Mistral QA pipeline.

    Args:
        question: The user's question.
        context: Text the answer should be drawn from.
        hf_api_key: Hugging Face token used to download the model.

    Returns:
        The value stored under the ``'answer'`` key of the pipeline output.
    """
    nlp = _load_qa_pipeline(hf_api_key)

    # The pipeline is called with a single dict holding both fields.
    inputs = {
        'context': context,
        'question': question
    }
    answer = nlp(inputs)
    return answer['answer']

def create_vectorstore(chunks):
    """Index *chunks* in a FAISS store using a new ``SentenceTransformerEmbeddings``."""
    return FAISS.from_texts(chunks, embedding=SentenceTransformerEmbeddings())

def generate_answer(vectorstore, question, hf_api_key):
    """Retrieve the three closest chunks for *question* and ask the model about them.

    Returns:
        A ``(answer, docs)`` tuple: the answer from ``ask_mistral`` and the
        retrieved documents whose text formed its context.
    """
    top_docs = vectorstore.similarity_search(question, k=3)
    passages = [hit.page_content for hit in top_docs]
    answer = ask_mistral(question, "\n".join(passages), hf_api_key)
    return answer, top_docs

def extract_website_text(url):
    """Download *url* and return its visible text with scripts and styles removed.

    Args:
        url: Address of the page to fetch (10-second timeout).

    Returns:
        The stripped page text, one text node per line. On any failure,
        including an HTTP error status, a string beginning with
        ``"Error extracting website: "`` is returned instead of raising.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures rather than returning the
        # error page's markup as if it were the requested content.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup.get_text(separator="\n").strip()
    except Exception as e:
        # Best-effort by design: the failure is reported as a string.
        return f"Error extracting website: {e}"

# === Streamlit App ===
st.set_page_config(
    page_title="📚 Multi-Source RAG Assistant",
    layout="wide",
)
st.title("🔍 RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar: choose the input type and collect the Hugging Face token.
data_source = st.sidebar.selectbox(
    "📂 Select Input Type",
    ["PDF", "CSV", "Website URL"],
)
hf_api_key = st.sidebar.text_input("🔑 Enter Hugging Face API Key", type="password")
if hf_api_key:
    # Run inside the sidebar container so the login status renders there.
    with st.sidebar:
        login_to_huggingface(hf_api_key)

# === Logic by Data Source ===
# Each branch turns the selected source into plain text, splits it into
# chunks, and builds the FAISS index stored in `vectorstore`.
vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("📄 Upload PDF", type="pdf")
    if pdf_file:
        # Write the upload and close the temp file BEFORE reopening it by
        # path, so every byte is flushed to disk when the PDF is parsed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_file.getvalue())
            tmp_path = tmp.name
        try:
            text = extract_text_from_pdf(tmp_path)
        finally:
            # delete=False means nothing else removes this file.
            os.remove(tmp_path)

        chunks = split_text(text)
        if chunks:
            vectorstore = create_vectorstore(chunks)
            full_data_text = text
            st.success("✅ PDF processed and indexed!")
        else:
            # An index cannot be built from zero chunks (e.g. a scanned PDF
            # with no text layer).
            st.warning("⚠️ No extractable text was found in this PDF.")

elif data_source == "CSV":
    csv_file = st.file_uploader("📊 Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("🔍 Exploratory Data Analysis")
        st.dataframe(df)
        st.write("📈 Summary Statistics")
        st.write(df.describe(include="all").transpose())

        csv_text = df.to_string(index=False)
        chunks = split_text(csv_text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = csv_text
        st.success("✅ CSV indexed and ready for Q&A!")

elif data_source == "Website URL":
    url = st.text_input("🌐 Enter Website URL")
    if url and st.button("📥 Extract Website"):
        web_text = extract_website_text(url)
        if web_text.startswith("Error"):
            st.session_state.pop("website_index", None)
            st.error(web_text)
        else:
            chunks = split_text(web_text)
            if chunks:
                st.session_state["website_index"] = {
                    "url": url,
                    "text": web_text,
                    "vectorstore": create_vectorstore(chunks),
                }
                st.success("✅ Website text extracted and indexed!")
            else:
                st.session_state.pop("website_index", None)
                st.warning("⚠️ No text was found on this page.")

    # st.button is True only on the rerun triggered by the click. Restoring
    # the index from session state keeps it available on later reruns, such
    # as the one caused by typing a question.
    saved_index = st.session_state.get("website_index")
    if saved_index and saved_index["url"] == url:
        vectorstore = saved_index["vectorstore"]
        full_data_text = saved_index["text"]

# === QA Section ===
# Without a token the user is prompted for one; with a token, questions are
# accepted only once an index has been built.
if not hf_api_key:
    st.info("🔐 Please enter your Hugging Face API key in the sidebar.")
elif vectorstore:
    st.subheader("❓ Ask a Question")
    question = st.text_input("💬 Your question")
    if question:
        with st.spinner("🔍 Thinking..."):
            answer, top_docs = generate_answer(vectorstore, question, hf_api_key)
            st.success("🧠 Answer")
            st.write(answer)

            # Show the retrieved chunks that were used as context.
            chunk_panel = st.expander("📌 Top Relevant Chunks")
            for rank, doc in enumerate(top_docs, start=1):
                chunk_panel.markdown(f"**Chunk {rank}:**\n```{doc.page_content}```")

            st.download_button("📤 Download Answer", answer, file_name="rag_answer.txt")