File size: 5,238 Bytes
603f5d9 fc18817 603f5d9 d5d6e44 603f5d9 2e67891 e84b2a5 f68e648 e84b2a5 603f5d9 e84b2a5 603f5d9 e84b2a5 603f5d9 e84b2a5 2e67891 603f5d9 e84b2a5 603f5d9 e84b2a5 603f5d9 e84b2a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import os
import fitz
import tempfile
import requests
import streamlit as st
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from transformers import pipeline
from huggingface_hub import HfApi, HfFolder, login
# === Embeddings Wrapper ===
class SentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a ``SentenceTransformer`` model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Instantiate the encoder once; both embed methods reuse it.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` and return the encoder output converted via ``.tolist()``."""
        vectors = self.model.encode(texts)
        return vectors.tolist()

    def embed_query(self, text):
        """Encode a single ``text`` and return its vector converted via ``.tolist()``."""
        # Encode as a one-element batch and keep the first (only) row.
        batch = self.model.encode([text])
        first_row = batch[0]
        return first_row.tolist()
# === Utility Functions ===
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with a
        newline between consecutive pages.
    """
    # Use the document as a context manager so it is closed (and its file
    # handle released) even when extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
def split_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping fixed-size character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. Empty ``text`` gives ``[]``.
        The final chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    step = chunk_size - overlap
    # A non-positive step would never advance ``start`` past the end of the
    # text, so reject it up front instead of looping forever.
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than chunk_size"
        )

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunks.append(text[start:end])
        if end == text_length:
            # The text is fully covered; stepping again would only emit a
            # tail that is already contained in this chunk.
            break
        start += step
    return chunks
def login_to_huggingface(api_key):
    """Save the Hugging Face token and report the outcome in the Streamlit UI.

    Args:
        api_key: Hugging Face access token handed to ``HfFolder.save_token``.

    Returns:
        None. Shows ``st.success`` when saving does not raise, otherwise
        ``st.error`` with the exception text.

    NOTE(review): only ``HfFolder.save_token`` is called here; presumably the
    token is stored without being validated against the Hub — confirm.
    """
    try:
        HfFolder.save_token(api_key)
    except Exception as e:
        # NOTE(review): the leading "β" looks like a mis-decoded emoji —
        # confirm the intended character.
        st.error(f"β Failed to log in: {e}")
    else:
        # The literal was split over two lines by a mis-decoded check-mark
        # emoji; restored so the string is a single valid literal.
        st.success("✅ Logged into Hugging Face successfully!")
@st.cache_resource(show_spinner=False)
def _load_qa_pipeline(hf_api_key):
    """Build the question-answering pipeline once and reuse it across reruns."""
    return pipeline(
        "question-answering",
        model="mistralai/Mistral-7B-v0.3",
        tokenizer="mistralai/Mistral-7B-v0.3",
        use_auth_token=hf_api_key,
    )


def ask_mistral(question, context, hf_api_key):
    """Answer ``question`` from ``context`` with the Mistral QA pipeline.

    Args:
        question: The user's question.
        context: Text the answer is extracted from.
        hf_api_key: Hugging Face token used to download the model.

    Returns:
        The ``'answer'`` field of the pipeline's result.

    NOTE(review): ``mistralai/Mistral-7B-v0.3`` is published as a causal
    language model; confirm that the ``question-answering`` task is really
    the intended way to use it.
    """
    # Loading the model is by far the most expensive step, so it is cached
    # (keyed by the token) instead of being rebuilt for every question.
    nlp = _load_qa_pipeline(hf_api_key)
    answer = nlp({"context": context, "question": question})
    return answer["answer"]
def create_vectorstore(chunks):
    """Build a FAISS vector store from ``chunks`` using ``SentenceTransformerEmbeddings``."""
    embedder = SentenceTransformerEmbeddings()
    store = FAISS.from_texts(chunks, embedding=embedder)
    return store
def generate_answer(vectorstore, question, hf_api_key):
    """Retrieve the three closest chunks and ask the model to answer from them.

    Returns:
        A ``(answer, docs)`` tuple: the result of ``ask_mistral`` and the
        documents returned by ``vectorstore.similarity_search``.
    """
    top_matches = vectorstore.similarity_search(question, k=3)
    # The retrieved chunks are concatenated into one newline-separated context.
    passages = [match.page_content for match in top_matches]
    context = "\n".join(passages)
    answer = ask_mistral(question, context, hf_api_key)
    return answer, top_matches
def extract_website_text(url):
    """Download ``url`` and return its visible text.

    Args:
        url: Address of the page to fetch (10-second timeout).

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        newline-separated and stripped. On any failure, a string of the form
        ``"Error extracting website: <reason>"`` is returned instead of
        raising.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures rather than returning the
        # error page's markup as if it were the site's content.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup.get_text(separator="\n").strip()
    except Exception as e:
        # Deliberately broad: callers rely on this function never raising
        # and detect failure from the returned message.
        return f"Error extracting website: {e}"
# === Streamlit App ===
st.set_page_config(
    page_title="π Multi-Source RAG Assistant",
    layout="wide",
)
st.title("π RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar: choose the input type and supply the Hugging Face token.
with st.sidebar:
    data_source = st.selectbox(
        "π Select Input Type",
        ["PDF", "CSV", "Website URL"],
    )
    hf_api_key = st.text_input(
        "π Enter Hugging Face API Key",
        type="password",
    )
    # Only attempt the login once a key has been entered.
    if hf_api_key:
        login_to_huggingface(hf_api_key)
# === Logic by Data Source ===
# Build ``vectorstore`` / ``full_data_text`` for whichever input type is
# selected in the sidebar; both keep their empty defaults until data is ready.
vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("π Upload PDF", type="pdf")
    if pdf_file:
        # extract_text_from_pdf takes a path, so the upload is written to a
        # temporary file first. It is closed before being read back so the
        # bytes are flushed, and removed afterwards so temp files do not
        # pile up on every rerun.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_file.read())
        try:
            text = extract_text_from_pdf(tmp.name)
        finally:
            try:
                os.remove(tmp.name)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not abort
                # the upload.
                pass
        chunks = split_text(text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = text
        st.success("✅ PDF processed and indexed!")

elif data_source == "CSV":
    csv_file = st.file_uploader("π Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("π Exploratory Data Analysis")
        st.dataframe(df)
        st.write("π Summary Statistics")
        st.write(df.describe(include="all").transpose())
        # The whole table is rendered as plain text and indexed like a document.
        csv_text = df.to_string(index=False)
        chunks = split_text(csv_text)
        vectorstore = create_vectorstore(chunks)
        full_data_text = csv_text
        st.success("✅ CSV indexed and ready for Q&A!")

elif data_source == "Website URL":
    url = st.text_input("π Enter Website URL")
    if url and st.button("π₯ Extract Website"):
        web_text = extract_website_text(url)
        # extract_website_text signals failure with this exact prefix;
        # matching all of it avoids rejecting pages whose own text merely
        # begins with "Error".
        if web_text.startswith("Error extracting website:"):
            st.error(web_text)
        else:
            chunks = split_text(web_text)
            # Keep the index in session state: st.button is only True on the
            # rerun caused by the click, so without this the index would be
            # gone again as soon as the user types a question.
            st.session_state["website_index"] = {
                "url": url,
                "vectorstore": create_vectorstore(chunks),
                "text": web_text,
            }
            st.success("✅ Website text extracted and indexed!")
    # Reuse the stored index on later reruns, but only for the URL it was
    # built from.
    if "website_index" in st.session_state:
        cached_index = st.session_state["website_index"]
        if cached_index["url"] == url:
            vectorstore = cached_index["vectorstore"]
            full_data_text = cached_index["text"]
# === QA Section ===
# Without a token only the hint is shown; with a token, the question box
# appears once an index has been built.
if not hf_api_key:
    st.info("π Please enter your Hugging Face API key in the sidebar.")
elif vectorstore:
    st.subheader("β Ask a Question")
    question = st.text_input("π¬ Your question")
    if question:
        with st.spinner("π Thinking..."):
            answer, top_docs = generate_answer(vectorstore, question, hf_api_key)

        st.success("π§ Answer")
        st.write(answer)

        # Show the retrieved chunks the answer was based on, numbered from 1.
        with st.expander("π Top Relevant Chunks"):
            for number, match in enumerate(top_docs, start=1):
                st.markdown(f"**Chunk {number}:**\n```{match.page_content}```")

        st.download_button(
            "π€ Download Answer",
            answer,
            file_name="rag_answer.txt",
        )
|