GlitchGhost committed on
Commit
603f5d9
·
verified ·
1 Parent(s): 72271e9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz
3
+ import tempfile
4
+ import requests
5
+ import streamlit as st
6
+ import pandas as pd
7
+ from bs4 import BeautifulSoup
8
+ from sentence_transformers import SentenceTransformer
9
+ from langchain.vectorstores.faiss import FAISS
10
+ from langchain.embeddings.base import Embeddings
11
+ import google.generativeai as genai
12
+
13
+ # === Embeddings Wrapper ===
14
class SentenceTransformerEmbeddings(Embeddings):
    """Adapter exposing a ``SentenceTransformer`` model through ``Embeddings``.

    Both methods delegate to ``self.model.encode`` and convert the result to
    plain Python lists with ``.tolist()``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the SentenceTransformer model identified by *model_name*."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode *texts* in a single call and return the vectors as lists."""
        encoded = self.model.encode(texts)
        return encoded.tolist()

    def embed_query(self, text):
        """Encode one *text* and return its vector as a list."""
        # The text is wrapped in a one-element batch; only the first row is
        # returned.
        encoded = self.model.encode([text])
        first_row = encoded[0]
        return first_row.tolist()
23
+
24
+ # === Utility Functions ===
25
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Page texts are joined with a newline, in page order.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz``.

    Returns:
        A single string containing the text of all pages.
    """
    # Open the document as a context manager so it is closed even when a page
    # fails to extract; an unclosed document keeps the underlying file open.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)
28
+
29
def split_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping, fixed-size character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last *overlap* characters of the previous one.
    Splitting stops as soon as a chunk reaches the end of *text*, so no
    trailing chunk is emitted that is fully contained in its predecessor.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* or *overlap* is out of range. A
            non-positive step would otherwise never advance through *text*.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        chunks.append(text[start:start + chunk_size])
        # This chunk already covers the tail of the text; a further chunk
        # would only repeat characters that are already included.
        if start + chunk_size >= total:
            break
    return chunks
37
+
38
def ask_gemini(question, context, api_key):
    """Send *question* together with *context* to Gemini and return the reply.

    Args:
        question: The user's question, inserted into the prompt.
        context: Supporting text, inserted into the prompt before the question.
        api_key: Key passed to ``genai.configure``.

    Returns:
        The ``text`` attribute of the ``generate_content`` response.
    """
    genai.configure(api_key=api_key)

    # The prompt is assembled line by line; the pieces concatenate to the same
    # text as a single multi-line template.
    prompt = (
        "You are a helpful assistant. Use the context below to answer the question.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "Answer:"
    )

    gemini = genai.GenerativeModel("gemini-pro")
    reply = gemini.generate_content(prompt)
    return reply.text
50
+
51
def create_vectorstore(chunks):
    """Build a FAISS store from *chunks* using SentenceTransformerEmbeddings.

    A new embeddings wrapper (with its default model) is created on every
    call and handed to ``FAISS.from_texts``.
    """
    return FAISS.from_texts(chunks, embedding=SentenceTransformerEmbeddings())
54
+
55
def generate_answer(vectorstore, question, api_key):
    """Answer *question* from the three chunks most similar to it.

    Args:
        vectorstore: Object providing ``similarity_search(query, k=...)``.
        question: The user's question; used both as search query and prompt.
        api_key: Gemini API key forwarded to ``ask_gemini``.

    Returns:
        A ``(answer, docs)`` tuple: the text returned by ``ask_gemini`` and
        the documents returned by the similarity search.
    """
    matches = vectorstore.similarity_search(question, k=3)
    passages = [match.page_content for match in matches]
    answer = ask_gemini(question, "\n".join(passages), api_key)
    return answer, matches
59
+
60
def extract_website_text(url):
    """Download *url* and return its visible text.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted; text nodes are separated by newlines and the result is
    stripped.

    Args:
        url: Address of the page to fetch (10 second timeout).

    Returns:
        The page text on success. On any failure, including an HTTP error
        status, a string of the form ``"Error extracting website: <reason>"``
        is returned instead of raising.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures; otherwise the body of an error
        # page would be returned as if it were the requested content.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup.get_text(separator="\n").strip()
    except Exception as e:
        # Deliberately broad: this function reports every failure in-band as
        # an "Error ..." string rather than raising.
        return f"Error extracting website: {e}"
70
+
71
+ # === Streamlit App ===
72
# Page-level configuration and heading.
st.set_page_config(
    page_title="πŸ“š Multi-Source RAG Assistant",
    layout="wide",
)
st.title("πŸ” RAG Assistant: Chat with PDF, CSV, or Website")

# Sidebar: choose the input type and supply the Gemini API key. The key field
# is a password input.
data_source = st.sidebar.selectbox(
    "πŸ“‚ Select Input Type",
    ["PDF", "CSV", "Website URL"],
)
gemini_api_key = st.sidebar.text_input(
    "πŸ”‘ Enter Gemini API Key",
    type="password",
)
79
+
80
+ # === Logic by Data Source ===
81
def _build_index(text):
    """Chunk *text* and index it; return None (after a warning) if it is empty."""
    chunks = split_text(text)
    if not chunks:
        # Nothing to embed (e.g. a PDF without a text layer) - presumably the
        # vector store cannot be built from zero texts, so stop here.
        st.warning("No text could be extracted from this source.")
        return None
    return create_vectorstore(chunks)


# Index for the currently selected source; stays None until one is built.
vectorstore = None
full_data_text = ""

if data_source == "PDF":
    pdf_file = st.file_uploader("πŸ“„ Upload PDF", type="pdf")
    if pdf_file:
        # Write the upload to disk and close the file before parsing, so the
        # bytes are flushed when the PDF library opens the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded file object.
            tmp.write(pdf_file.getvalue())
        try:
            text = extract_text_from_pdf(tmp.name)
        finally:
            # delete=False means the file is never removed automatically.
            # Removal is best effort: a failure here must not hide the result
            # (or the error) of the extraction above.
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
        vectorstore = _build_index(text)
        full_data_text = text
        if vectorstore is not None:
            st.success("βœ… PDF processed and indexed!")

elif data_source == "CSV":
    csv_file = st.file_uploader("πŸ“Š Upload CSV", type="csv")
    if csv_file:
        df = pd.read_csv(csv_file)
        st.subheader("πŸ” Exploratory Data Analysis")
        st.dataframe(df)
        st.write("πŸ“ˆ Summary Statistics")
        st.write(df.describe(include="all").transpose())

        # The whole table is rendered as text and indexed like a document.
        csv_text = df.to_string(index=False)
        vectorstore = _build_index(csv_text)
        full_data_text = csv_text
        if vectorstore is not None:
            st.success("βœ… CSV indexed and ready for Q&A!")

elif data_source == "Website URL":
    url = st.text_input("🌐 Enter Website URL")
    if url and st.button("πŸ“₯ Extract Website"):
        # Drop any earlier result so a failed extraction is not answered from
        # stale content.
        st.session_state.pop("website_index", None)
        web_text = extract_website_text(url)
        if web_text.startswith("Error"):
            st.error(web_text)
        else:
            built = _build_index(web_text)
            if built is not None:
                st.session_state["website_index"] = {
                    "url": url,
                    "text": web_text,
                    "vectorstore": built,
                }
                st.success("βœ… Website text extracted and indexed!")

    # st.button() is True only on the rerun caused by the click. Typing a
    # question triggers another rerun, so the index is kept in session state
    # and restored here as long as the URL field is unchanged.
    cached = st.session_state.get("website_index")
    if cached is not None and cached["url"] == url:
        vectorstore = cached["vectorstore"]
        full_data_text = cached["text"]
121
+
122
+ # === QA Section ===
123
# Question answering is offered only once a source is indexed and a key is set.
if vectorstore and gemini_api_key:
    st.subheader("❓ Ask a Question")
    question = st.text_input("πŸ’¬ Your question")
    if question:
        try:
            with st.spinner("πŸ” Thinking..."):
                answer, top_docs = generate_answer(vectorstore, question, gemini_api_key)
        except Exception as exc:
            # Top-level UI boundary: report API failures (e.g. a rejected key)
            # as a message instead of an unhandled traceback.
            st.error(f"Could not generate an answer: {exc}")
        else:
            st.success("🧠 Answer")
            st.write(answer)

            with st.expander("πŸ“Œ Top Relevant Chunks"):
                for i, doc in enumerate(top_docs, start=1):
                    st.markdown(f"**Chunk {i}:**")
                    # Show the chunk as plain preformatted text. Embedding it
                    # in an inline ``` fence made Markdown read the first line
                    # of a multi-line chunk as the fence's info string and
                    # left the fence unclosed.
                    st.text(doc.page_content)

            st.download_button("πŸ“€ Download Answer", answer, file_name="rag_answer.txt")

elif not gemini_api_key:
    st.info("πŸ” Please enter your Gemini API key in the sidebar.")
140
+