muddasser committed on
Commit
7a70581
·
verified ·
1 Parent(s): 2751049

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +318 -189
app.py CHANGED
@@ -2,29 +2,13 @@ import streamlit as st
2
  import os
3
  import re
4
  import logging
 
5
  from playwright.sync_api import sync_playwright
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain.schema import Document
10
 
11
- # Try importing transformers with fallback
12
- try:
13
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
14
- import transformers
15
- logging.info(f"Transformers version: {transformers.__version__}")
16
- except ImportError as e:
17
- st.error(f"Failed to import transformers: {str(e)}. Attempting fallback without pipeline.")
18
- logging.error(f"Transformers import failed: {str(e)}")
19
- try:
20
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
21
- import transformers
22
- logging.info(f"Fallback: Imported AutoTokenizer and AutoModelForSeq2SeqLM, version: {transformers.__version__}")
23
- except ImportError as e:
24
- st.error(f"Failed to import transformers fallback: {str(e)}. Please ensure transformers==4.44.2 and tokenizers==0.19.1 are installed.")
25
- logging.error(f"Transformers fallback import failed: {str(e)}")
26
- st.stop()
27
-
28
  # Set up logging
29
  logging.basicConfig(
30
  filename='/app/cache/app.log',
@@ -32,224 +16,369 @@ logging.basicConfig(
32
  format='%(asctime)s - %(levelname)s - %(message)s'
33
  )
34
 
35
- # Set page configuration
36
  st.set_page_config(
37
- page_title="Web Scraping + RAG Chatbot",
38
- page_icon="πŸ•·οΈ",
39
  layout="wide",
40
  initial_sidebar_state="expanded"
41
  )
42
 
43
- # App title and description
44
- st.title("πŸ•·οΈ Web Scraping + RAG Chatbot")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  st.markdown("""
46
- This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
47
- Enter a URL to scrape its content, then ask questions about the scraped data.
48
- """)
 
 
 
 
 
49
 
50
- # Initialize session state
51
  if 'scraped_content' not in st.session_state:
52
  st.session_state.scraped_content = ""
53
  if 'vector_store' not in st.session_state:
54
  st.session_state.vector_store = None
55
  if 'chat_history' not in st.session_state:
56
  st.session_state.chat_history = []
57
- if 'qa_pipeline' not in st.session_state:
58
- st.session_state.qa_pipeline = None
59
 
60
- def clean_text(text):
61
- """Clean and normalize scraped text."""
62
- try:
63
- text = re.sub(r'\s+', ' ', text)
64
- text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
65
- return text.strip()
66
- except Exception as e:
67
- logging.error(f"Error cleaning text: {str(e)}")
68
- return text
69
 
70
- def scrape_website(url):
71
- """Scrape data from the given URL using Playwright."""
72
- logging.info(f"Starting scrape for URL: {url}")
 
 
 
 
 
 
 
 
 
 
73
  with sync_playwright() as p:
74
- browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
 
 
 
75
  page = browser.new_page()
76
  try:
77
- logging.info(f"Navigating to {url}")
78
  page.goto(url, wait_until="domcontentloaded", timeout=30000)
79
  title = page.title()
80
- content_selectors = [
81
- "#content",
82
- ".mw-parser-output",
83
- "main",
84
- ".main-content",
85
- "#main",
86
- "article"
87
- ]
88
- main_content = None
89
- for selector in content_selectors:
90
  try:
91
- main_content = page.query_selector(selector)
92
- if main_content:
93
- logging.info(f"Found content with selector: {selector}")
94
  break
95
- except:
96
  continue
97
- if not main_content:
98
- main_content = page.query_selector("body")
99
- logging.info("Falling back to body tag for content")
100
- text_content = main_content.inner_text()
101
- cleaned_content = clean_text(text_content)
102
- logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
103
- return {
104
- "title": title,
105
- "content": cleaned_content,
106
- "url": url
107
- }
108
  except Exception as e:
109
- logging.error(f"Error scraping {url}: {str(e)}")
110
- st.error(f"Error scraping {url}: {str(e)}")
111
  return None
112
  finally:
113
  browser.close()
114
 
115
- @st.cache_resource
116
- def initialize_qa_model():
117
- """Initialize the QA model with fallback."""
118
- if st.session_state.qa_pipeline is None:
119
- try:
120
- with st.spinner("Loading FLAN-T5 model..."):
121
- model_name = "google/flan-t5-small"
122
- tokenizer = AutoTokenizer.from_pretrained(model_name)
123
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
124
- try:
125
- st.session_state.qa_pipeline = pipeline(
126
- "text2text-generation",
127
- model=model,
128
- tokenizer=tokenizer,
129
- max_length=200
130
- )
131
- logging.info("Initialized QA pipeline successfully")
132
- except NameError:
133
- logging.warning("Pipeline not available, using raw model and tokenizer")
134
- st.session_state.qa_pipeline = (model, tokenizer)
135
- return st.session_state.qa_pipeline
136
- except Exception as e:
137
- st.error(f"Failed to load QA model: {str(e)}")
138
- logging.error(f"Error loading QA model: {str(e)}")
139
- return None
140
- return st.session_state.qa_pipeline
141
 
142
  @st.cache_resource
143
- def create_vector_store(text):
144
- """Create a FAISS vector store."""
145
  try:
146
- text_splitter = RecursiveCharacterTextSplitter(
147
- chunk_size=500,
148
- chunk_overlap=50,
149
- length_function=len
150
  )
151
- documents = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
152
  embeddings = HuggingFaceEmbeddings(
153
  model_name="sentence-transformers/all-MiniLM-L6-v2",
154
  model_kwargs={'device': 'cpu'}
155
  )
156
- vector_store = FAISS.from_documents(documents, embeddings)
157
- logging.info("FAISS vector store created successfully")
158
- return vector_store
159
  except Exception as e:
160
- st.error(f"Error creating vector store: {str(e)}")
161
- logging.error(f"Error creating vector store: {str(e)}")
162
  return None
163
 
164
- def answer_question(question):
165
- """Answer a question using RAG with fallback."""
 
 
 
 
 
 
 
 
166
  if st.session_state.vector_store is None:
167
- return "Please scrape a website first."
168
- if st.session_state.qa_pipeline is None:
169
- return "QA model not loaded."
170
  try:
171
- relevant_docs = st.session_state.vector_store.similarity_search(question, k=3)
172
- context = " ".join([doc.page_content for doc in relevant_docs])
173
- prompt = f"""
174
- Based on the context, answer the question. If the answer is not in the context, say "I don't know".
175
- Context: {context}
176
- Question: {question}
177
- Answer:
178
- """
179
- if isinstance(st.session_state.qa_pipeline, tuple):
180
- # Fallback: Use raw model and tokenizer
181
- model, tokenizer = st.session_state.qa_pipeline
182
- inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
183
- outputs = model.generate(**inputs, max_length=200, do_sample=False, temperature=0.3)
184
- answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
185
- else:
186
- # Use pipeline
187
- result = st.session_state.qa_pipeline(
188
- prompt,
189
- max_length=200,
190
- do_sample=False,
191
- temperature=0.3
192
- )
193
- answer = result[0]['generated_text']
194
- return answer.strip()
 
 
195
  except Exception as e:
196
- logging.error(f"Error answering question: {str(e)}")
197
- return f"Error generating answer: {str(e)}"
198
 
199
- def is_valid_url(url):
200
- """Validate URL format."""
201
- pattern = r'^https?://[\w\-\.]+(?:\:\d+)?(?:/[\w\-\./]*)*$'
202
- return bool(re.match(pattern, url))
203
 
204
- # Sidebar navigation
205
- st.sidebar.title("Navigation")
206
- app_mode = st.sidebar.radio("Choose a mode", ["Web Scraping", "Chat with Content", "About"])
207
-
208
- if app_mode == "Web Scraping":
209
- st.header("🌐 Web Scraping")
210
- url = st.text_input("Enter URL to scrape", "https://example.com")
211
- if st.button("Scrape Website"):
212
- if url and is_valid_url(url):
213
- with st.spinner("Scraping website..."):
214
- result = scrape_website(url)
215
- if result:
216
- st.success(f"Successfully scraped: {result['title']}")
217
- st.session_state.scraped_content = result['content']
218
- with st.spinner("Indexing content..."):
219
- st.session_state.vector_store = create_vector_store(result['content'])
220
- initialize_qa_model()
221
- with st.expander("View scraped content"):
222
- st.text_area("Content", result['content'], height=300)
223
- else:
224
- st.error("Failed to scrape the website. Check logs at /app/cache/app.log.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  else:
226
- st.warning("Please enter a valid URL (e.g., https://example.com).")
 
 
 
 
 
 
 
 
 
227
 
228
- elif app_mode == "Chat with Content":
229
- st.header("💬 Chat with Scraped Content")
230
  if st.session_state.vector_store is None:
231
- st.info("Please scrape a website first to enable chatting.")
232
- st.stop()
233
- for message in st.session_state.chat_history:
234
- with st.chat_message(message["role"]):
235
- st.markdown(message["content"])
236
- if prompt := st.chat_input("Ask a question about the scraped content"):
237
- st.session_state.chat_history.append({"role": "user", "content": prompt})
238
- with st.chat_message("user"):
239
- st.markdown(prompt)
240
- with st.chat_message("assistant"):
241
- with st.spinner("Generating answer..."):
242
- answer = answer_question(prompt)
 
 
 
243
  st.markdown(answer)
244
- st.session_state.chat_history.append({"role": "assistant", "content": answer})
245
 
246
- elif app_mode == "About":
247
- st.header("ℹ️ About")
248
- st.markdown("""
249
- This app uses Playwright for web scraping, LangChain for vector storage with FAISS,
250
- and Hugging Face models for embeddings and question answering.
251
- - **Web Scraping**: Extracts text using headless Chromium via Playwright.
252
- - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
253
- - **Tech Stack**: Python, Streamlit, Playwright, LangChain, Hugging Face Transformers, FAISS.
254
- - **Docker**: Runs in a containerized environment.
255
- """)
 
2
  import os
3
  import re
4
  import logging
5
+ import requests
6
  from playwright.sync_api import sync_playwright
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_community.embeddings import HuggingFaceEmbeddings
10
  from langchain.schema import Document
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # Set up logging
13
  logging.basicConfig(
14
  filename='/app/cache/app.log',
 
16
  format='%(asctime)s - %(levelname)s - %(message)s'
17
  )
18
 
19
+ # ── Page config ────────────────────────────────────────────────────────────────
20
  st.set_page_config(
21
+ page_title="RAG Chatbot Β· Mistral",
22
+ page_icon="πŸ•ΈοΈ",
23
  layout="wide",
24
  initial_sidebar_state="expanded"
25
  )
26
 
27
+ # ── Custom CSS ─────────────────────────────────────────────────────────────────
28
+ st.markdown("""
29
+ <style>
30
+ @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Mono:ital,wght@0,400;0,500;1,400&display=swap');
31
+
32
+ html, body, [class*="css"] {
33
+ font-family: 'DM Mono', monospace;
34
+ background-color: #0d0d0d;
35
+ color: #e8e2d4;
36
+ }
37
+
38
+ h1, h2, h3 {
39
+ font-family: 'Syne', sans-serif;
40
+ letter-spacing: -0.02em;
41
+ }
42
+
43
+ .stApp {
44
+ background: #0d0d0d;
45
+ }
46
+
47
+ /* Sidebar */
48
+ [data-testid="stSidebar"] {
49
+ background: #111111;
50
+ border-right: 1px solid #2a2a2a;
51
+ }
52
+
53
+ /* Inputs */
54
+ .stTextInput > div > div > input,
55
+ .stTextArea textarea {
56
+ background: #1a1a1a !important;
57
+ border: 1px solid #2e2e2e !important;
58
+ border-radius: 4px !important;
59
+ color: #e8e2d4 !important;
60
+ font-family: 'DM Mono', monospace !important;
61
+ }
62
+
63
+ /* Buttons */
64
+ .stButton > button {
65
+ background: #c8f135 !important;
66
+ color: #0d0d0d !important;
67
+ border: none !important;
68
+ border-radius: 4px !important;
69
+ font-family: 'Syne', sans-serif !important;
70
+ font-weight: 700 !important;
71
+ letter-spacing: 0.05em !important;
72
+ text-transform: uppercase !important;
73
+ padding: 0.5rem 1.5rem !important;
74
+ transition: all 0.15s ease !important;
75
+ }
76
+ .stButton > button:hover {
77
+ background: #d9ff45 !important;
78
+ transform: translateY(-1px);
79
+ box-shadow: 0 4px 20px rgba(200,241,53,0.3) !important;
80
+ }
81
+
82
+ /* Chat messages */
83
+ [data-testid="stChatMessage"] {
84
+ background: #161616 !important;
85
+ border: 1px solid #242424 !important;
86
+ border-radius: 6px !important;
87
+ margin-bottom: 0.5rem !important;
88
+ }
89
+
90
+ /* Chat input */
91
+ [data-testid="stChatInput"] textarea {
92
+ background: #1a1a1a !important;
93
+ border: 1px solid #2e2e2e !important;
94
+ color: #e8e2d4 !important;
95
+ font-family: 'DM Mono', monospace !important;
96
+ }
97
+
98
+ /* Status / info boxes */
99
+ .stAlert {
100
+ background: #1a1a1a !important;
101
+ border: 1px solid #2e2e2e !important;
102
+ border-radius: 4px !important;
103
+ }
104
+
105
+ /* Expander */
106
+ .streamlit-expanderHeader {
107
+ background: #161616 !important;
108
+ border: 1px solid #2a2a2a !important;
109
+ font-family: 'DM Mono', monospace !important;
110
+ }
111
+
112
+ /* Accent tag */
113
+ .tag {
114
+ display: inline-block;
115
+ background: #c8f135;
116
+ color: #0d0d0d;
117
+ font-family: 'Syne', sans-serif;
118
+ font-weight: 700;
119
+ font-size: 0.7rem;
120
+ letter-spacing: 0.1em;
121
+ text-transform: uppercase;
122
+ padding: 2px 8px;
123
+ border-radius: 2px;
124
+ margin-right: 6px;
125
+ }
126
+
127
+ .status-bar {
128
+ display: flex;
129
+ align-items: center;
130
+ gap: 8px;
131
+ padding: 10px 14px;
132
+ background: #161616;
133
+ border: 1px solid #242424;
134
+ border-radius: 4px;
135
+ margin-bottom: 1rem;
136
+ font-size: 0.8rem;
137
+ color: #888;
138
+ }
139
+
140
+ .status-dot {
141
+ width: 8px;
142
+ height: 8px;
143
+ border-radius: 50%;
144
+ background: #444;
145
+ }
146
+ .status-dot.active {
147
+ background: #c8f135;
148
+ box-shadow: 0 0 6px rgba(200,241,53,0.6);
149
+ }
150
+ </style>
151
+ """, unsafe_allow_html=True)
152
+
153
+ # ── Header ─────────────────────────────────────────────────────────────────────
154
  st.markdown("""
155
+ <div style="padding: 2rem 0 1rem 0;">
156
+ <span class="tag">RAG</span>
157
+ <h1 style="display:inline; font-size:2.2rem; color:#e8e2d4;">Web Scraper × Mistral</h1>
158
+ <p style="color:#666; font-size:0.85rem; margin-top:0.5rem; font-family:'DM Mono',monospace;">
159
+ Scrape any URL → index with FAISS → chat with Mistral 7B via Ollama
160
+ </p>
161
+ </div>
162
+ """, unsafe_allow_html=True)
163
 
164
+ # ── Session state ──────────────────────────────────────────────────────────────
165
  if 'scraped_content' not in st.session_state:
166
  st.session_state.scraped_content = ""
167
  if 'vector_store' not in st.session_state:
168
  st.session_state.vector_store = None
169
  if 'chat_history' not in st.session_state:
170
  st.session_state.chat_history = []
171
+ if 'scraped_title' not in st.session_state:
172
+ st.session_state.scraped_title = None
173
 
174
+ # ── Ollama config ──────────────────────────────────────────────────────────────
175
+ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
176
+ OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "mistral")
177
+
178
+ # ── Helpers ────────────────────────────────────────────────────────────────────
 
 
 
 
179
 
180
+ def clean_text(text: str) -> str:
181
+ text = re.sub(r'\s+', ' ', text)
182
+ text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
183
+ return text.strip()
184
+
185
+
186
+ def is_valid_url(url: str) -> bool:
187
+ pattern = r'^https?://[\w\-\.]+(?::\d+)?(?:/[\w\-\./]*)*$'
188
+ return bool(re.match(pattern, url))
189
+
190
+
191
+ def scrape_website(url: str):
192
+ logging.info(f"Scraping: {url}")
193
  with sync_playwright() as p:
194
+ browser = p.chromium.launch(
195
+ headless=True,
196
+ args=['--no-sandbox', '--disable-dev-shm-usage']
197
+ )
198
  page = browser.new_page()
199
  try:
 
200
  page.goto(url, wait_until="domcontentloaded", timeout=30000)
201
  title = page.title()
202
+ selectors = ["#content", ".mw-parser-output", "main",
203
+ ".main-content", "#main", "article"]
204
+ el = None
205
+ for sel in selectors:
 
 
 
 
 
 
206
  try:
207
+ el = page.query_selector(sel)
208
+ if el:
 
209
  break
210
+ except Exception:
211
  continue
212
+ if not el:
213
+ el = page.query_selector("body")
214
+ text = clean_text(el.inner_text())
215
+ logging.info(f"Scraped {len(text)} chars")
216
+ return {"title": title, "content": text, "url": url}
 
 
 
 
 
 
217
  except Exception as e:
218
+ logging.error(f"Scrape error: {e}")
219
+ st.error(f"Scraping failed: {e}")
220
  return None
221
  finally:
222
  browser.close()
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  @st.cache_resource
226
+ def create_vector_store(text: str):
 
227
  try:
228
+ splitter = RecursiveCharacterTextSplitter(
229
+ chunk_size=500, chunk_overlap=50, length_function=len
 
 
230
  )
231
+ docs = [Document(page_content=c) for c in splitter.split_text(text)]
232
  embeddings = HuggingFaceEmbeddings(
233
  model_name="sentence-transformers/all-MiniLM-L6-v2",
234
  model_kwargs={'device': 'cpu'}
235
  )
236
+ vs = FAISS.from_documents(docs, embeddings)
237
+ logging.info("Vector store created")
238
+ return vs
239
  except Exception as e:
240
+ logging.error(f"Vector store error: {e}")
241
+ st.error(f"Indexing failed: {e}")
242
  return None
243
 
244
+
245
+ def check_ollama() -> bool:
246
+ try:
247
+ r = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=3)
248
+ return r.status_code == 200
249
+ except Exception:
250
+ return False
251
+
252
+
253
+ def answer_question(question: str) -> str:
254
  if st.session_state.vector_store is None:
255
+ return "No content indexed yet — please scrape a website first."
 
 
256
  try:
257
+ docs = st.session_state.vector_store.similarity_search(question, k=3)
258
+ context = " ".join(d.page_content for d in docs)
259
+ prompt = (
260
+ "You are a helpful assistant. Answer the question using ONLY the "
261
+ "context below. If the answer is not in the context, say \"I don't know\".\n\n"
262
+ f"Context:\n{context}\n\n"
263
+ f"Question: {question}\n\n"
264
+ "Answer:"
265
+ )
266
+ payload = {
267
+ "model": OLLAMA_MODEL,
268
+ "messages": [{"role": "user", "content": prompt}],
269
+ "stream": False
270
+ }
271
+ resp = requests.post(
272
+ f"{OLLAMA_BASE_URL}/api/chat",
273
+ json=payload,
274
+ timeout=120
275
+ )
276
+ resp.raise_for_status()
277
+ return resp.json()["message"]["content"].strip()
278
+ except requests.exceptions.ConnectionError:
279
+ return (
280
+ "⚠️ Cannot reach Ollama. Make sure Ollama is running and "
281
+ f"`{OLLAMA_BASE_URL}` is accessible."
282
+ )
283
  except Exception as e:
284
+ logging.error(f"Answer error: {e}")
285
+ return f"Error generating answer: {e}"
286
 
 
 
 
 
287
 
288
+ # ── Sidebar ────────────────────────────────────────────────────────────────────
289
+ with st.sidebar:
290
+ st.markdown("<h3 style='font-family:Syne,sans-serif;'>Settings</h3>", unsafe_allow_html=True)
291
+
292
+ ollama_url = st.text_input("Ollama URL", value=OLLAMA_BASE_URL)
293
+ model_name = st.text_input("Model", value=OLLAMA_MODEL)
294
+ OLLAMA_BASE_URL = ollama_url
295
+ OLLAMA_MODEL = model_name
296
+
297
+ st.markdown("---")
298
+
299
+ # Ollama status
300
+ alive = check_ollama()
301
+ dot_class = "active" if alive else ""
302
+ status_text = "Ollama connected" if alive else "Ollama not found"
303
+ st.markdown(f"""
304
+ <div class="status-bar">
305
+ <div class="status-dot {dot_class}"></div>
306
+ <span>{status_text}</span>
307
+ </div>
308
+ """, unsafe_allow_html=True)
309
+
310
+ if st.session_state.scraped_title:
311
+ st.markdown(f"""
312
+ <div class="status-bar">
313
+ <div class="status-dot active"></div>
314
+ <span>Indexed: {st.session_state.scraped_title[:30]}…</span>
315
+ </div>
316
+ """, unsafe_allow_html=True)
317
+
318
+ st.markdown("---")
319
+ st.markdown("""
320
+ <div style='font-size:0.75rem; color:#555; font-family:"DM Mono",monospace;'>
321
+ <b style='color:#888;'>Stack</b><br>
322
+ Playwright Β· FAISS<br>
323
+ MiniLM embeddings<br>
324
+ Mistral 7B via Ollama
325
+ </div>
326
+ """, unsafe_allow_html=True)
327
+
328
+ if not alive:
329
+ st.markdown("""
330
+ <div style='font-size:0.75rem; color:#c8f135; margin-top:1rem;'>
331
+ To start Ollama:<br><br>
332
+ <code style='color:#aaa;'>ollama serve</code><br>
333
+ <code style='color:#aaa;'>ollama pull mistral</code>
334
+ </div>
335
+ """, unsafe_allow_html=True)
336
+
337
+ # ── Main tabs ──────────────────────────────────────────────────────────────────
338
+ tab1, tab2 = st.tabs(["🌐 Scrape", "💬 Chat"])
339
+
340
+ # ── Tab 1: Scrape ──────────────────────────────────────────────────────────────
341
+ with tab1:
342
+ st.markdown("### Enter a URL to scrape and index")
343
+ url_input = st.text_input("URL", placeholder="https://en.wikipedia.org/wiki/Mistral_AI")
344
+
345
+ if st.button("Scrape & Index"):
346
+ if not url_input or not is_valid_url(url_input):
347
+ st.warning("Please enter a valid URL starting with http:// or https://")
348
  else:
349
+ with st.spinner("Scraping…"):
350
+ result = scrape_website(url_input)
351
+ if result:
352
+ st.session_state.scraped_content = result['content']
353
+ st.session_state.scraped_title = result['title']
354
+ with st.spinner("Building FAISS index…"):
355
+ st.session_state.vector_store = create_vector_store(result['content'])
356
+ st.success(f"✓ Indexed **{result['title']}** — {len(result['content']):,} characters")
357
+ with st.expander("Preview scraped text"):
358
+ st.text_area("", result['content'][:3000] + "…", height=250)
359
 
360
+ # ── Tab 2: Chat ────────────────────────────────────────────────────────────────
361
+ with tab2:
362
  if st.session_state.vector_store is None:
363
+ st.info("Scrape a website first (tab above), then come back to chat.")
364
+ else:
365
+ # Render history
366
+ for msg in st.session_state.chat_history:
367
+ with st.chat_message(msg["role"]):
368
+ st.markdown(msg["content"])
369
+
370
+ # New input
371
+ if prompt := st.chat_input("Ask anything about the scraped content…"):
372
+ st.session_state.chat_history.append({"role": "user", "content": prompt})
373
+ with st.chat_message("user"):
374
+ st.markdown(prompt)
375
+ with st.chat_message("assistant"):
376
+ with st.spinner("Mistral is thinking…"):
377
+ answer = answer_question(prompt)
378
  st.markdown(answer)
379
+ st.session_state.chat_history.append({"role": "assistant", "content": answer})
380
 
381
+ if st.session_state.chat_history:
382
+ if st.button("Clear chat"):
383
+ st.session_state.chat_history = []
384
+ st.rerun()