Spaces:

pradeep4321
/

sample_crawler

Sleeping

App Files Community

pradeep4321 committed on Apr 15

Commit

2f54431

verified ·

1 Parent(s): ec67f77

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +75 -54

src/streamlit_app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 # =========================================================
-# 🌐 WEBSITE RAG + IMAGE UNDERSTANDING (HF SPACES)
 # =========================================================
 import streamlit as st
@@ -10,6 +10,7 @@ import faiss
 import torch
 from PIL import Image
 from io import BytesIO
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
@@ -20,16 +21,18 @@ from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
 st.set_page_config(page_title="🌐 Website QA System", layout="wide")
 # ==============================
-# LOAD MODELS
 # ==============================
 @st.cache_resource
 def load_models():
     embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
     qa_pipeline = pipeline(
-        "text2text-generation",
         model="google/flan-t5-base",
-        max_length=256
     )
     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
@@ -50,47 +53,50 @@ if "documents" not in st.session_state:
 if "index" not in st.session_state:
     st.session_state.index = None
 # ==============================
 # CRAWL WEBSITE
 # ==============================
 def crawl_website(url):
     try:
-        res = requests.get(url)
         soup = BeautifulSoup(res.text, "html.parser")
-        links = []
         for a in soup.find_all("a", href=True):
-            link = a["href"]
             if link.startswith("http"):
-                links.append(link)
-        return list(set(links))[:20]  # limit
-    except:
         return []
 # ==============================
-# EXTRACT CONTENT
 # ==============================
 def extract_content(url):
     try:
-        res = requests.get(url)
         soup = BeautifulSoup(res.text, "html.parser")
         # TEXT
-        paragraphs = [p.get_text() for p in soup.find_all("p")]
         text = " ".join(paragraphs)
         # IMAGES → CAPTION
         image_texts = []
         images = soup.find_all("img")
-        for img in images[:5]:  # limit images
             try:
-                img_url = img.get("src")
-                if not img_url.startswith("http"):
-                    continue
-                img_res = requests.get(img_url)
                 image = Image.open(BytesIO(img_res.content)).convert("RGB")
                 inputs = processor(image, return_tensors="pt")
@@ -102,8 +108,7 @@ def extract_content(url):
             except:
                 continue
-        full_text = text + " " + " ".join(image_texts)
-        return full_text
     except:
         return ""
@@ -113,13 +118,10 @@ def extract_content(url):
 # ==============================
 def chunk_text(text, size=300):
     words = text.split()
-    chunks = []
-    for i in range(0, len(words), size):
-        chunks.append(" ".join(words[i:i+size]))
-    return chunks
 # ==============================
-# BUILD INDEX
 # ==============================
 def build_index(texts):
     embeddings = embed_model.encode(texts)
@@ -128,13 +130,24 @@ def build_index(texts):
     index = faiss.IndexFlatL2(dim)
     index.add(np.array(embeddings))
-    return index, embeddings
 # ==============================
 # UI
 # ==============================
-st.title("🌐 Website QA with Images")
 url = st.text_input("🔗 Enter Website URL")
 if st.button("Crawl Website"):
@@ -144,15 +157,16 @@ if st.button("Crawl Website"):
         st.session_state.links = links
         st.success(f"Found {len(links)} pages")
     else:
-        st.error("No links found")
 # ==============================
-# PAGE SELECTION
 # ==============================
-if "links" in st.session_state:
-    st.subheader("Select Pages to Train")
-    selected_links = []
     for link in st.session_state.links:
         if st.checkbox(link):
             selected_links.append(link)
@@ -167,35 +181,37 @@ if "links" in st.session_state:
                 all_chunks.extend(chunks)
         if all_chunks:
-            index, embeddings = build_index(all_chunks)
             st.session_state.documents = all_chunks
-            st.session_state.index = index
-            st.success("Training completed!")
 # ==============================
-# ADD MORE PAGES
 # ==============================
-if "links" in st.session_state:
-    st.subheader("➕ Add More Pages")
-    new_url = st.text_input("Add another URL")
-    if st.button("Add & Train"):
-        content = extract_content(new_url)
-        chunks = chunk_text(content)
-        if chunks:
-            new_embeddings = embed_model.encode(chunks)
-            st.session_state.index.add(np.array(new_embeddings))
-            st.session_state.documents.extend(chunks)
-            st.success("Added new page!")
 # ==============================
-# ASK QUESTIONS
 # ==============================
 st.subheader("💬 Ask Questions")
@@ -203,7 +219,7 @@ query = st.text_input("Ask something from the website")
 if st.button("Get Answer"):
     if st.session_state.index is None:
-        st.warning("Please train pages first")
     else:
         q_embed = embed_model.encode([query])
@@ -212,16 +228,21 @@ if st.button("Get Answer"):
         context = " ".join([st.session_state.documents[i] for i in I[0]])
         prompt = f"""
-        Answer the question based on the context.
         Context:
         {context}
         Question:
         {query}
         """
-        answer = qa_pipeline(prompt)[0]["generated_text"]
         st.write("### ✅ Answer")
-        st.write(answer)

 # =========================================================
+# 🌐 WEBSITE RAG + IMAGE QA (HF SPACES FIXED VERSION)
 # =========================================================
 import streamlit as st
 import torch
 from PIL import Image
 from io import BytesIO
+from urllib.parse import urljoin
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
 st.set_page_config(page_title="🌐 Website QA System", layout="wide")
 # ==============================
+# LOAD MODELS (FIXED)
 # ==============================
 @st.cache_resource
 def load_models():
     embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
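+    # ✅ FIX: use text-generation instead of text2text-generation — NOTE(review): flan-t5 is an encoder-decoder (seq2seq) model; confirm this task actually loads it, text2text-generation is the usual match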
     qa_pipeline = pipeline(
+        "text-generation",
         model="google/flan-t5-base",
+        max_length=256,
+        do_sample=False
     )
     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 if "index" not in st.session_state:
     st.session_state.index = None
+if "links" not in st.session_state:
+    st.session_state.links = []
 # ==============================
 # CRAWL WEBSITE
 # ==============================
 def crawl_website(url):
     try:
+        res = requests.get(url, timeout=10)
         soup = BeautifulSoup(res.text, "html.parser")
+        links = set()
         for a in soup.find_all("a", href=True):
+            link = urljoin(url, a["href"])  # ✅ FIX relative links
             if link.startswith("http"):
+                links.add(link)
+        return list(links)[:20]
+    except Exception as e:
         return []
 # ==============================
+# EXTRACT CONTENT (TEXT + IMAGES)
 # ==============================
 def extract_content(url):
     try:
+        res = requests.get(url, timeout=10)
         soup = BeautifulSoup(res.text, "html.parser")
         # TEXT
+        paragraphs = [p.get_text().strip() for p in soup.find_all("p")]
         text = " ".join(paragraphs)
         # IMAGES → CAPTION
         image_texts = []
         images = soup.find_all("img")
+        for img in images[:5]:  # limit
             try:
+                img_url = urljoin(url, img.get("src"))
+                img_res = requests.get(img_url, timeout=5)
                 image = Image.open(BytesIO(img_res.content)).convert("RGB")
                 inputs = processor(image, return_tensors="pt")
             except:
                 continue
+        return text + " " + " ".join(image_texts)
     except:
         return ""
 # ==============================
 def chunk_text(text, size=300):
     words = text.split()
+    return [" ".join(words[i:i+size]) for i in range(0, len(words), size)]
 # ==============================
+# BUILD FAISS INDEX
 # ==============================
 def build_index(texts):
     embeddings = embed_model.encode(texts)
     index = faiss.IndexFlatL2(dim)
     index.add(np.array(embeddings))
+    return index
+# ==============================
+# ADD TO EXISTING INDEX
+# ==============================
+def add_to_index(new_chunks):
+    new_embeddings = embed_model.encode(new_chunks)
+    st.session_state.index.add(np.array(new_embeddings))
+    st.session_state.documents.extend(new_chunks)
 # ==============================
 # UI
 # ==============================
+st.title("🌐 Website QA with Images (Fixed)")
+# ==============================
+# STEP 1: URL INPUT
+# ==============================
 url = st.text_input("🔗 Enter Website URL")
 if st.button("Crawl Website"):
         st.session_state.links = links
         st.success(f"Found {len(links)} pages")
     else:
+        st.error("No links found or invalid URL")
 # ==============================
+# STEP 2: PAGE SELECTION
 # ==============================
+selected_links = []
+if st.session_state.links:
+    st.subheader("📄 Select Pages to Train")
     for link in st.session_state.links:
         if st.checkbox(link):
             selected_links.append(link)
                 all_chunks.extend(chunks)
         if all_chunks:
+            st.session_state.index = build_index(all_chunks)
             st.session_state.documents = all_chunks
+            st.success("✅ Training completed!")
+        else:
+            st.warning("No content extracted")
 # ==============================
+# STEP 3: ADD MORE PAGES
 # ==============================
+st.subheader("➕ Add More Pages")
+new_url = st.text_input("Enter another page URL")
+if st.button("Add & Train"):
+    content = extract_content(new_url)
+    chunks = chunk_text(content)
+    if chunks:
+        if st.session_state.index is None:
+            st.session_state.index = build_index(chunks)
+            st.session_state.documents = chunks
+        else:
+            add_to_index(chunks)
+        st.success("✅ Page added successfully!")
+    else:
+        st.error("Failed to extract content")
 # ==============================
+# STEP 4: ASK QUESTIONS
 # ==============================
 st.subheader("💬 Ask Questions")
 if st.button("Get Answer"):
     if st.session_state.index is None:
+        st.warning("⚠️ Please train pages first")
     else:
         q_embed = embed_model.encode([query])
         context = " ".join([st.session_state.documents[i] for i in I[0]])
         prompt = f"""
+        Answer based only on the context.
         Context:
         {context}
         Question:
         {query}
+        Answer:
         """
+        response = qa_pipeline(prompt)[0]["generated_text"]
+        # ✅ CLEAN OUTPUT
+        answer = response.replace(prompt, "").strip()
         st.write("### ✅ Answer")
+        st.write(answer if answer else "No relevant answer found")