import os
import tempfile

import fitz  # PyMuPDF
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from huggingface_hub import login
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from transformers import pipeline
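
# To launch the UI:  streamlit run app.py   (the filename here is illustrative)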


# === Embeddings Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
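    # Minimal adapter: LangChain's FAISS wrapper expects an Embeddings object
    # exposing embed_documents/embed_query, while SentenceTransformer only
    # exposes encode(); this class bridges the two.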
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode([text])[0].tolist()

# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join([page.get_text() for page in doc])

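# Fixed-size character chunking: each step advances by (chunk_size - overlap),
# so consecutive chunks share `overlap` characters of context; e.g. a
# 1,200-character text yields chunks starting at offsets 0, 450, and 900
# with the defaults.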
def split_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

def login_to_huggingface(api_key):
    try:
        # login() validates and stores the token (HfFolder.save_token is deprecated)
        login(token=api_key)
        st.success("✅ Logged into Hugging Face successfully!")
    except Exception as e:
        st.error(f"❌ Failed to log in: {e}")

def ask_mistral(question, context, hf_api_key):
    # Mistral-7B is a generative (causal) LM, so it must run through the
    # "text-generation" pipeline; the extractive "question-answering" pipeline
    # only accepts span-prediction models (e.g. BERT fine-tuned on SQuAD).
    # NOTE: reloading a 7B model on every question is very slow; in practice,
    # cache the pipeline (e.g. with @st.cache_resource).
    generator = pipeline(
        "text-generation",
        model="mistralai/Mistral-7B-v0.3",
        tokenizer="mistralai/Mistral-7B-v0.3",
        token=hf_api_key,  # `use_auth_token` is deprecated in recent transformers
    )

    # Build a grounded prompt from the retrieved context
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\nAnswer:"
    )

    # Return only the newly generated text, not the echoed prompt
    output = generator(prompt, max_new_tokens=256, return_full_text=False)
    return output[0]["generated_text"].strip()
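
# Alternative (a hedged sketch, not part of the original app): query Hugging
# Face's hosted Inference API instead of downloading the 7B weights locally.
# Assumes the model is available on the serverless endpoint; the payload
# follows the standard text-generation schema.
def ask_mistral_via_api(question, context, hf_api_key):
    url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.3"
    headers = {"Authorization": f"Bearer {hf_api_key}"}
    payload = {
        "inputs": f"Context:\n{context}\n\nQuestion: {question}\nAnswer:",
        "parameters": {"max_new_tokens": 256, "return_full_text": False},
    }
    res = requests.post(url, headers=headers, json=payload, timeout=60)
    res.raise_for_status()
    return res.json()[0]["generated_text"].strip()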

def create_vectorstore(chunks):
    embeddings = SentenceTransformerEmbeddings()
    return FAISS.from_texts(chunks, embedding=embeddings)

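# Retrieve-then-read: fetch the k most similar chunks from the index and hand
# them to the LLM as grounding context for the question.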
def generate_answer(vectorstore, question, hf_api_key):
    docs = vectorstore.similarity_search(question, k=3)
    context = "\n".join([doc.page_content for doc in docs])
    return ask_mistral(question, context, hf_api_key), docs

def extract_website_text(url):
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()  # surface HTTP errors instead of parsing an error page
        soup = BeautifulSoup(res.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        text = soup.get_text(separator="\n")
        return text.strip()
    except Exception as e:
        return f"Error extracting website: {e}"

# === Streamlit App ===
st.set_page_config(page_title="📚 Multi-Source RAG Assistant", layout="wide")
st.title("🔍 RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar
with st.sidebar:
    data_source = st.selectbox("📂 Select Input Type", ["PDF", "CSV", "Website URL"])
    hf_api_key = st.text_input("🔑 Enter Hugging Face API Key", type="password")
    if hf_api_key:
        login_to_huggingface(hf_api_key)

# === Logic by Data Source ===
vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("📄 Upload PDF", type="pdf")
    if pdf_file:
        # Close the temp file before reading it back: inside the `with` block
        # the bytes may not be flushed yet (and Windows locks the open handle).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_file.read())
            tmp_path = tmp.name
        text = extract_text_from_pdf(tmp_path)
        os.remove(tmp_path)  # clean up the temp file
        chunks = split_text(text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = text
        st.success("✅ PDF processed and indexed!")

elif data_source == "CSV":
    csv_file = st.file_uploader("📊 Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("🔍 Exploratory Data Analysis")
        st.dataframe(df)
        st.write("📈 Summary Statistics")
        st.write(df.describe(include="all").transpose())

        csv_text = df.to_string(index=False)
        chunks = split_text(csv_text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = csv_text
        st.success("✅ CSV indexed and ready for Q&A!")

elif data_source == "Website URL":
    url = st.text_input("🌐 Enter Website URL")
    if url and st.button("📥 Extract Website"):
        web_text = extract_website_text(url)
        if web_text.startswith("Error"):
            st.error(web_text)
        else:
            chunks = split_text(web_text)
            vectorstore = create_vectorstore(chunks)
            full_data_text = web_text
            st.success("βœ… Website text extracted and indexed!")

# === QA Section ===
if vectorstore and hf_api_key:
    st.subheader("❓ Ask a Question")
    question = st.text_input("💬 Your question")
    if question:
        with st.spinner("🔍 Thinking..."):
            answer, top_docs = generate_answer(vectorstore, question, hf_api_key)
            st.success("🧠 Answer")
            st.write(answer)

            with st.expander("📌 Top Relevant Chunks"):
                for i, doc in enumerate(top_docs):
                    st.markdown(f"**Chunk {i+1}:**\n```\n{doc.page_content}\n```")

            st.download_button("📤 Download Answer", answer, file_name="rag_answer.txt")

elif not hf_api_key:
    st.info("🔐 Please enter your Hugging Face API key in the sidebar.")