pradeep4321 committed on
Commit
46325f0
·
verified ·
1 Parent(s): 2f54431

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +56 -160
src/streamlit_app.py CHANGED
@@ -1,61 +1,27 @@
1
  # =========================================================
2
- # 🌐 WEBSITE RAG + IMAGE QA (HF SPACES FIXED VERSION)
3
  # =========================================================
4
 
5
  import streamlit as st
6
  import requests
7
  from bs4 import BeautifulSoup
8
- import numpy as np
9
- import faiss
10
- import torch
11
- from PIL import Image
12
- from io import BytesIO
13
  from urllib.parse import urljoin
14
 
15
- from sentence_transformers import SentenceTransformer
16
- from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
17
-
18
  # ==============================
19
  # PAGE CONFIG
20
  # ==============================
21
- st.set_page_config(page_title="🌐 Website QA System", layout="wide")
22
-
23
- # ==============================
24
- # LOAD MODELS (FIXED)
25
- # ==============================
26
- @st.cache_resource
27
- def load_models():
28
- embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
29
-
30
- # ✅ FIX: use text-generation instead of text2text-generation
31
- qa_pipeline = pipeline(
32
- "text-generation",
33
- model="google/flan-t5-base",
34
- max_length=256,
35
- do_sample=False
36
- )
37
-
38
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
39
- image_model = BlipForConditionalGeneration.from_pretrained(
40
- "Salesforce/blip-image-captioning-base"
41
- )
42
-
43
- return embed_model, qa_pipeline, processor, image_model
44
-
45
- embed_model, qa_pipeline, processor, image_model = load_models()
46
 
47
  # ==============================
48
  # SESSION STATE
49
  # ==============================
50
- if "documents" not in st.session_state:
51
- st.session_state.documents = []
52
-
53
- if "index" not in st.session_state:
54
- st.session_state.index = None
55
-
56
  if "links" not in st.session_state:
57
  st.session_state.links = []
58
 
 
 
 
59
  # ==============================
60
  # CRAWL WEBSITE
61
  # ==============================
@@ -67,19 +33,19 @@ def crawl_website(url):
67
  links = set()
68
 
69
  for a in soup.find_all("a", href=True):
70
- link = urljoin(url, a["href"]) # ✅ FIX relative links
71
  if link.startswith("http"):
72
  links.add(link)
73
 
74
- return list(links)[:20]
75
 
76
- except Exception as e:
77
  return []
78
 
79
  # ==============================
80
- # EXTRACT CONTENT (TEXT + IMAGES)
81
  # ==============================
82
- def extract_content(url):
83
  try:
84
  res = requests.get(url, timeout=10)
85
  soup = BeautifulSoup(res.text, "html.parser")
@@ -88,65 +54,28 @@ def extract_content(url):
88
  paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
89
  text = " ".join(paragraphs)
90
 
91
- # IMAGES → CAPTION
92
- image_texts = []
93
- images = soup.find_all("img")
94
-
95
- for img in images[:5]: # limit
96
- try:
97
- img_url = urljoin(url, img.get("src"))
98
-
99
- img_res = requests.get(img_url, timeout=5)
100
- image = Image.open(BytesIO(img_res.content)).convert("RGB")
101
-
102
- inputs = processor(image, return_tensors="pt")
103
- out = image_model.generate(**inputs)
104
- caption = processor.decode(out[0], skip_special_tokens=True)
105
-
106
- image_texts.append(caption)
107
-
108
- except:
109
- continue
110
 
111
- return text + " " + " ".join(image_texts)
 
 
 
 
112
 
113
  except:
114
- return ""
115
-
116
- # ==============================
117
- # CHUNKING
118
- # ==============================
119
- def chunk_text(text, size=300):
120
- words = text.split()
121
- return [" ".join(words[i:i+size]) for i in range(0, len(words), size)]
122
-
123
- # ==============================
124
- # BUILD FAISS INDEX
125
- # ==============================
126
- def build_index(texts):
127
- embeddings = embed_model.encode(texts)
128
- dim = embeddings.shape[1]
129
-
130
- index = faiss.IndexFlatL2(dim)
131
- index.add(np.array(embeddings))
132
-
133
- return index
134
-
135
- # ==============================
136
- # ADD TO EXISTING INDEX
137
- # ==============================
138
- def add_to_index(new_chunks):
139
- new_embeddings = embed_model.encode(new_chunks)
140
- st.session_state.index.add(np.array(new_embeddings))
141
- st.session_state.documents.extend(new_chunks)
142
 
143
  # ==============================
144
  # UI
145
  # ==============================
146
- st.title("🌐 Website QA with Images (Fixed)")
147
 
148
  # ==============================
149
- # STEP 1: URL INPUT
150
  # ==============================
151
  url = st.text_input("πŸ”— Enter Website URL")
152
 
@@ -157,92 +86,59 @@ if st.button("Crawl Website"):
157
  st.session_state.links = links
158
  st.success(f"Found {len(links)} pages")
159
  else:
160
- st.error("No links found or invalid URL")
161
 
162
  # ==============================
163
- # STEP 2: PAGE SELECTION
164
  # ==============================
165
  selected_links = []
166
 
167
  if st.session_state.links:
168
- st.subheader("πŸ“„ Select Pages to Train")
169
 
170
  for link in st.session_state.links:
171
  if st.checkbox(link):
172
  selected_links.append(link)
173
 
174
- if st.button("Train Selected Pages"):
175
- all_chunks = []
176
-
177
- with st.spinner("Processing pages..."):
178
- for link in selected_links:
179
- content = extract_content(link)
180
- chunks = chunk_text(content)
181
- all_chunks.extend(chunks)
182
-
183
- if all_chunks:
184
- st.session_state.index = build_index(all_chunks)
185
- st.session_state.documents = all_chunks
186
-
187
- st.success("βœ… Training completed!")
188
- else:
189
- st.warning("No content extracted")
190
-
191
  # ==============================
192
- # STEP 3: ADD MORE PAGES
193
  # ==============================
194
- st.subheader("βž• Add More Pages")
195
-
196
- new_url = st.text_input("Enter another page URL")
197
 
198
- if st.button("Add & Train"):
199
- content = extract_content(new_url)
200
- chunks = chunk_text(content)
 
 
201
 
202
- if chunks:
203
- if st.session_state.index is None:
204
- st.session_state.index = build_index(chunks)
205
- st.session_state.documents = chunks
206
- else:
207
- add_to_index(chunks)
208
-
209
- st.success("βœ… Page added successfully!")
210
  else:
211
- st.error("Failed to extract content")
212
 
213
  # ==============================
214
- # STEP 4: ASK QUESTIONS
215
  # ==============================
216
- st.subheader("πŸ’¬ Ask Questions")
217
-
218
- query = st.text_input("Ask something from the website")
219
-
220
- if st.button("Get Answer"):
221
- if st.session_state.index is None:
222
- st.warning("⚠️ Please train pages first")
223
- else:
224
- q_embed = embed_model.encode([query])
225
-
226
- D, I = st.session_state.index.search(np.array(q_embed), k=5)
227
 
228
- context = " ".join([st.session_state.documents[i] for i in I[0]])
 
229
 
230
- prompt = f"""
231
- Answer based only on the context.
232
-
233
- Context:
234
- {context}
235
-
236
- Question:
237
- {query}
238
-
239
- Answer:
240
- """
241
 
242
- response = qa_pipeline(prompt)[0]["generated_text"]
243
 
244
- # ✅ CLEAN OUTPUT
245
- answer = response.replace(prompt, "").strip()
 
246
 
247
- st.write("### βœ… Answer")
248
- st.write(answer if answer else "No relevant answer found")
 
 
1
  # =========================================================
2
+ # 🌐 WEBSITE CRAWLER + DOWNLOAD TOOL
3
  # =========================================================
4
 
5
  import streamlit as st
6
  import requests
7
  from bs4 import BeautifulSoup
8
+ import pandas as pd
 
 
 
 
9
  from urllib.parse import urljoin
10
 
 
 
 
11
  # ==============================
12
  # PAGE CONFIG
13
  # ==============================
14
+ st.set_page_config(page_title="🌐 Website Crawler", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  # ==============================
17
  # SESSION STATE
18
  # ==============================
 
 
 
 
 
 
19
  if "links" not in st.session_state:
20
  st.session_state.links = []
21
 
22
+ if "data" not in st.session_state:
23
+ st.session_state.data = []
24
+
25
  # ==============================
26
  # CRAWL WEBSITE
27
  # ==============================
 
33
  links = set()
34
 
35
  for a in soup.find_all("a", href=True):
36
+ link = urljoin(url, a["href"])
37
  if link.startswith("http"):
38
  links.add(link)
39
 
40
+ return list(links)[:30]
41
 
42
+ except:
43
  return []
44
 
45
  # ==============================
46
+ # EXTRACT PAGE CONTENT
47
  # ==============================
48
+ def extract_page(url):
49
  try:
50
  res = requests.get(url, timeout=10)
51
  soup = BeautifulSoup(res.text, "html.parser")
 
54
  paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
55
  text = " ".join(paragraphs)
56
 
57
+ # IMAGES
58
+ images = []
59
+ for img in soup.find_all("img"):
60
+ img_url = urljoin(url, img.get("src"))
61
+ images.append(img_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ return {
64
+ "url": url,
65
+ "text": text,
66
+ "images": images
67
+ }
68
 
69
  except:
70
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  # ==============================
73
  # UI
74
  # ==============================
75
+ st.title("🌐 Website Crawler + Downloader")
76
 
77
  # ==============================
78
+ # STEP 1: ENTER URL
79
  # ==============================
80
  url = st.text_input("πŸ”— Enter Website URL")
81
 
 
86
  st.session_state.links = links
87
  st.success(f"Found {len(links)} pages")
88
  else:
89
+ st.error("No links found")
90
 
91
  # ==============================
92
+ # STEP 2: SELECT PAGES
93
  # ==============================
94
  selected_links = []
95
 
96
  if st.session_state.links:
97
+ st.subheader("πŸ“„ Select Pages to Crawl")
98
 
99
  for link in st.session_state.links:
100
  if st.checkbox(link):
101
  selected_links.append(link)
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # ==============================
104
+ # STEP 3: EXTRACT DATA
105
  # ==============================
106
+ if st.button("Extract Selected Pages"):
107
+ all_data = []
 
108
 
109
+ with st.spinner("Extracting content..."):
110
+ for link in selected_links:
111
+ data = extract_page(link)
112
+ if data:
113
+ all_data.append(data)
114
 
115
+ if all_data:
116
+ st.session_state.data = all_data
117
+ st.success("βœ… Data extracted successfully!")
 
 
 
 
 
118
  else:
119
+ st.warning("No data extracted")
120
 
121
  # ==============================
122
+ # STEP 4: SHOW DATA
123
  # ==============================
124
+ if st.session_state.data:
125
+ st.subheader("πŸ“Š Extracted Data Preview")
 
 
 
 
 
 
 
 
 
126
 
127
+ df = pd.DataFrame(st.session_state.data)
128
+ st.dataframe(df)
129
 
130
+ # ==============================
131
+ # STEP 5: DOWNLOAD OPTIONS
132
+ # ==============================
133
+ if st.session_state.data:
134
+ st.subheader("⬇️ Download Data")
 
 
 
 
 
 
135
 
136
+ df = pd.DataFrame(st.session_state.data)
137
 
138
+ # CSV
139
+ csv = df.to_csv(index=False).encode("utf-8")
140
+ st.download_button("Download CSV", csv, "website_data.csv", "text/csv")
141
 
142
+ # JSON
143
+ json_data = df.to_json(orient="records", indent=2)
144
+ st.download_button("Download JSON", json_data, "website_data.json", "application/json")