Spaces:

muddasser
/

Webscrapping_Playwright

Sleeping

App Files Files Community

muddasser committed on Mar 10

Commit

7a70581

verified ·

1 Parent(s): 2751049

Update app.py

Browse files

Files changed (1) hide show

app.py +318 -189

app.py CHANGED Viewed

@@ -2,29 +2,13 @@ import streamlit as st
 import os
 import re
 import logging
 from playwright.sync_api import sync_playwright
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.schema import Document
-# Try importing transformers with fallback
-try:
-    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
-    import transformers
-    logging.info(f"Transformers version: {transformers.__version__}")
-except ImportError as e:
-    st.error(f"Failed to import transformers: {str(e)}. Attempting fallback without pipeline.")
-    logging.error(f"Transformers import failed: {str(e)}")
-    try:
-        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-        import transformers
-        logging.info(f"Fallback: Imported AutoTokenizer and AutoModelForSeq2SeqLM, version: {transformers.__version__}")
-    except ImportError as e:
-        st.error(f"Failed to import transformers fallback: {str(e)}. Please ensure transformers==4.44.2 and tokenizers==0.19.1 are installed.")
-        logging.error(f"Transformers fallback import failed: {str(e)}")
-        st.stop()
 # Set up logging
 logging.basicConfig(
     filename='/app/cache/app.log',
@@ -32,224 +16,369 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
-# Set page configuration
 st.set_page_config(
-    page_title="Web Scraping + RAG Chatbot",
-    page_icon="🕷️",
     layout="wide",
     initial_sidebar_state="expanded"
 )
-# App title and description
-st.title("🕷️ Web Scraping + RAG Chatbot")
 st.markdown("""
-This app combines web scraping with Retrieval-Augmented Generation (RAG) to create an intelligent chatbot.
-Enter a URL to scrape its content, then ask questions about the scraped data.
-""")
-# Initialize session state
 if 'scraped_content' not in st.session_state:
     st.session_state.scraped_content = ""
 if 'vector_store' not in st.session_state:
     st.session_state.vector_store = None
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
-if 'qa_pipeline' not in st.session_state:
-    st.session_state.qa_pipeline = None
-def clean_text(text):
-    """Clean and normalize scraped text."""
-    try:
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
-        return text.strip()
-    except Exception as e:
-        logging.error(f"Error cleaning text: {str(e)}")
-        return text
-def scrape_website(url):
-    """Scrape data from the given URL using Playwright."""
-    logging.info(f"Starting scrape for URL: {url}")
     with sync_playwright() as p:
-        browser = p.chromium.launch(headless=True, args=['--no-sandbox', '--disable-dev-shm-usage'])
         page = browser.new_page()
         try:
-            logging.info(f"Navigating to {url}")
             page.goto(url, wait_until="domcontentloaded", timeout=30000)
             title = page.title()
-            content_selectors = [
-                "#content",
-                ".mw-parser-output",
-                "main",
-                ".main-content",
-                "#main",
-                "article"
-            ]
-            main_content = None
-            for selector in content_selectors:
                 try:
-                    main_content = page.query_selector(selector)
-                    if main_content:
-                        logging.info(f"Found content with selector: {selector}")
                         break
-                except:
                     continue
-            if not main_content:
-                main_content = page.query_selector("body")
-                logging.info("Falling back to body tag for content")
-            text_content = main_content.inner_text()
-            cleaned_content = clean_text(text_content)
-            logging.info(f"Scraped {len(cleaned_content)} characters from {url}")
-            return {
-                "title": title,
-                "content": cleaned_content,
-                "url": url
-            }
         except Exception as e:
-            logging.error(f"Error scraping {url}: {str(e)}")
-            st.error(f"Error scraping {url}: {str(e)}")
             return None
         finally:
             browser.close()
-@st.cache_resource
-def initialize_qa_model():
-    """Initialize the QA model with fallback."""
-    if st.session_state.qa_pipeline is None:
-        try:
-            with st.spinner("Loading FLAN-T5 model..."):
-                model_name = "google/flan-t5-small"
-                tokenizer = AutoTokenizer.from_pretrained(model_name)
-                model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-                try:
-                    st.session_state.qa_pipeline = pipeline(
-                        "text2text-generation",
-                        model=model,
-                        tokenizer=tokenizer,
-                        max_length=200
-                    )
-                    logging.info("Initialized QA pipeline successfully")
-                except NameError:
-                    logging.warning("Pipeline not available, using raw model and tokenizer")
-                    st.session_state.qa_pipeline = (model, tokenizer)
-                return st.session_state.qa_pipeline
-        except Exception as e:
-            st.error(f"Failed to load QA model: {str(e)}")
-            logging.error(f"Error loading QA model: {str(e)}")
-            return None
-    return st.session_state.qa_pipeline
 @st.cache_resource
-def create_vector_store(text):
-    """Create a FAISS vector store."""
     try:
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=500,
-            chunk_overlap=50,
-            length_function=len
         )
-        documents = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
         embeddings = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",
             model_kwargs={'device': 'cpu'}
         )
-        vector_store = FAISS.from_documents(documents, embeddings)
-        logging.info("FAISS vector store created successfully")
-        return vector_store
     except Exception as e:
-        st.error(f"Error creating vector store: {str(e)}")
-        logging.error(f"Error creating vector store: {str(e)}")
         return None
-def answer_question(question):
-    """Answer a question using RAG with fallback."""
     if st.session_state.vector_store is None:
-        return "Please scrape a website first."
-    if st.session_state.qa_pipeline is None:
-        return "QA model not loaded."
     try:
-        relevant_docs = st.session_state.vector_store.similarity_search(question, k=3)
-        context = " ".join([doc.page_content for doc in relevant_docs])
-        prompt = f"""
-        Based on the context, answer the question. If the answer is not in the context, say "I don't know".
-        Context: {context}
-        Question: {question}
-        Answer:
-        """
-        if isinstance(st.session_state.qa_pipeline, tuple):
-            # Fallback: Use raw model and tokenizer
-            model, tokenizer = st.session_state.qa_pipeline
-            inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
-            outputs = model.generate(**inputs, max_length=200, do_sample=False, temperature=0.3)
-            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        else:
-            # Use pipeline
-            result = st.session_state.qa_pipeline(
-                prompt,
-                max_length=200,
-                do_sample=False,
-                temperature=0.3
-            )
-            answer = result[0]['generated_text']
-        return answer.strip()
     except Exception as e:
-        logging.error(f"Error answering question: {str(e)}")
-        return f"Error generating answer: {str(e)}"
-def is_valid_url(url):
-    """Validate URL format."""
-    pattern = r'^https?://[\w\-\.]+(?:\:\d+)?(?:/[\w\-\./]*)*$'
-    return bool(re.match(pattern, url))
-# Sidebar navigation
-st.sidebar.title("Navigation")
-app_mode = st.sidebar.radio("Choose a mode", ["Web Scraping", "Chat with Content", "About"])
-if app_mode == "Web Scraping":
-    st.header("🌐 Web Scraping")
-    url = st.text_input("Enter URL to scrape", "https://example.com")
-    if st.button("Scrape Website"):
-        if url and is_valid_url(url):
-            with st.spinner("Scraping website..."):
-                result = scrape_website(url)
-                if result:
-                    st.success(f"Successfully scraped: {result['title']}")
-                    st.session_state.scraped_content = result['content']
-                    with st.spinner("Indexing content..."):
-                        st.session_state.vector_store = create_vector_store(result['content'])
-                    initialize_qa_model()
-                    with st.expander("View scraped content"):
-                        st.text_area("Content", result['content'], height=300)
-                else:
-                    st.error("Failed to scrape the website. Check logs at /app/cache/app.log.")
         else:
-            st.warning("Please enter a valid URL (e.g., https://example.com).")
-elif app_mode == "Chat with Content":
-    st.header("💬 Chat with Scraped Content")
     if st.session_state.vector_store is None:
-        st.info("Please scrape a website first to enable chatting.")
-        st.stop()
-    for message in st.session_state.chat_history:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
-    if prompt := st.chat_input("Ask a question about the scraped content"):
-        st.session_state.chat_history.append({"role": "user", "content": prompt})
-        with st.chat_message("user"):
-            st.markdown(prompt)
-        with st.chat_message("assistant"):
-            with st.spinner("Generating answer..."):
-                answer = answer_question(prompt)
                 st.markdown(answer)
-                st.session_state.chat_history.append({"role": "assistant", "content": answer})
-elif app_mode == "About":
-    st.header("ℹ️ About")
-    st.markdown("""
-    This app uses Playwright for web scraping, LangChain for vector storage with FAISS,
-    and Hugging Face models for embeddings and question answering.
-    - **Web Scraping**: Extracts text using headless Chromium via Playwright.
-    - **RAG**: Indexes content with sentence-transformers and answers questions using FLAN-T5.
-    - **Tech Stack**: Python, Streamlit, Playwright, LangChain, Hugging Face Transformers, FAISS.
-    - **Docker**: Runs in a containerized environment.
-    """)

 import os
 import re
 import logging
+import requests
 from playwright.sync_api import sync_playwright
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.schema import Document
 # Set up logging
 logging.basicConfig(
     filename='/app/cache/app.log',
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
+# ── Page config ────────────────────────────────────────────────────────────────
 st.set_page_config(
+    page_title="RAG Chatbot · Mistral",
+    page_icon="🕸️",
     layout="wide",
     initial_sidebar_state="expanded"
 )
+# ── Custom CSS ─────────────────────────────────────────────────────────────────
+st.markdown("""
+<style>
+@import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;700;800&family=DM+Mono:ital,wght@0,400;0,500;1,400&display=swap');
+html, body, [class*="css"] {
+    font-family: 'DM Mono', monospace;
+    background-color: #0d0d0d;
+    color: #e8e2d4;
+}
+h1, h2, h3 {
+    font-family: 'Syne', sans-serif;
+    letter-spacing: -0.02em;
+}
+.stApp {
+    background: #0d0d0d;
+}
+/* Sidebar */
+[data-testid="stSidebar"] {
+    background: #111111;
+    border-right: 1px solid #2a2a2a;
+}
+/* Inputs */
+.stTextInput > div > div > input,
+.stTextArea textarea {
+    background: #1a1a1a !important;
+    border: 1px solid #2e2e2e !important;
+    border-radius: 4px !important;
+    color: #e8e2d4 !important;
+    font-family: 'DM Mono', monospace !important;
+}
+/* Buttons */
+.stButton > button {
+    background: #c8f135 !important;
+    color: #0d0d0d !important;
+    border: none !important;
+    border-radius: 4px !important;
+    font-family: 'Syne', sans-serif !important;
+    font-weight: 700 !important;
+    letter-spacing: 0.05em !important;
+    text-transform: uppercase !important;
+    padding: 0.5rem 1.5rem !important;
+    transition: all 0.15s ease !important;
+}
+.stButton > button:hover {
+    background: #d9ff45 !important;
+    transform: translateY(-1px);
+    box-shadow: 0 4px 20px rgba(200,241,53,0.3) !important;
+}
+/* Chat messages */
+[data-testid="stChatMessage"] {
+    background: #161616 !important;
+    border: 1px solid #242424 !important;
+    border-radius: 6px !important;
+    margin-bottom: 0.5rem !important;
+}
+/* Chat input */
+[data-testid="stChatInput"] textarea {
+    background: #1a1a1a !important;
+    border: 1px solid #2e2e2e !important;
+    color: #e8e2d4 !important;
+    font-family: 'DM Mono', monospace !important;
+}
+/* Status / info boxes */
+.stAlert {
+    background: #1a1a1a !important;
+    border: 1px solid #2e2e2e !important;
+    border-radius: 4px !important;
+}
+/* Expander */
+.streamlit-expanderHeader {
+    background: #161616 !important;
+    border: 1px solid #2a2a2a !important;
+    font-family: 'DM Mono', monospace !important;
+}
+/* Accent tag */
+.tag {
+    display: inline-block;
+    background: #c8f135;
+    color: #0d0d0d;
+    font-family: 'Syne', sans-serif;
+    font-weight: 700;
+    font-size: 0.7rem;
+    letter-spacing: 0.1em;
+    text-transform: uppercase;
+    padding: 2px 8px;
+    border-radius: 2px;
+    margin-right: 6px;
+}
+.status-bar {
+    display: flex;
+    align-items: center;
+    gap: 8px;
+    padding: 10px 14px;
+    background: #161616;
+    border: 1px solid #242424;
+    border-radius: 4px;
+    margin-bottom: 1rem;
+    font-size: 0.8rem;
+    color: #888;
+}
+.status-dot {
+    width: 8px;
+    height: 8px;
+    border-radius: 50%;
+    background: #444;
+}
+.status-dot.active {
+    background: #c8f135;
+    box-shadow: 0 0 6px rgba(200,241,53,0.6);
+}
+</style>
+""", unsafe_allow_html=True)
+# ── Header ─────────────────────────────────────────────────────────────────────
 st.markdown("""
+<div style="padding: 2rem 0 1rem 0;">
+    <span class="tag">RAG</span>
+    <h1 style="display:inline; font-size:2.2rem; color:#e8e2d4;">Web Scraper × Mistral</h1>
+    <p style="color:#666; font-size:0.85rem; margin-top:0.5rem; font-family:'DM Mono',monospace;">
+        Scrape any URL → index with FAISS → chat with Mistral 7B via Ollama
+    </p>
+</div>
+""", unsafe_allow_html=True)
# ── Session state ──────────────────────────────────────────────────────────────
# Seed every key the app reads so later code can rely on it being present.
# Factories (not shared values) are used so each session gets its own list.
for _state_key, _default_factory in (
    ('scraped_content', str),
    ('vector_store', lambda: None),
    ('chat_history', list),
    ('scraped_title', lambda: None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_factory()
# ── Ollama config ──────────────────────────────────────────────────────────────
# Both values come from the environment with local-development defaults.
# An unset, empty, or whitespace-only variable falls back to the default, and a
# trailing "/" is removed from the base URL so that f"{OLLAMA_BASE_URL}/api/..."
# never produces a double slash in the request path.
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "").strip().rstrip("/") or "http://localhost:11434"
OLLAMA_MODEL    = os.getenv("OLLAMA_MODEL", "").strip() or "mistral"
+# ── Helpers ────────────────────────────────────────────────────────────────────
def clean_text(text: str) -> str:
    """Normalize scraped text before it is chunked and indexed.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ? ; :`` is replaced by a space; runs of whitespace are then
    collapsed to a single space and both ends are trimmed.

    Symbols are stripped *before* whitespace is collapsed, so a removed
    symbol that was surrounded by spaces (``"a - b"``) cannot leave a
    multi-space gap in the result.

    Args:
        text: Raw text. An empty or ``None`` value yields ``""``.

    Returns:
        The cleaned, single-spaced text.
    """
    if not text:
        return ""
    text = re.sub(r'[^\w\s.,!?;:]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def is_valid_url(url: str) -> bool:
    """Return True when *url* is an absolute http(s) URL with a host.

    The URL is parsed with ``urllib.parse`` rather than matched against a
    hand-written pattern, so query strings, fragments, percent-escapes and
    characters such as parentheses in the path are accepted.

    Args:
        url: Candidate URL exactly as it will be passed to the scraper.

    Returns:
        ``False`` for non-strings, empty strings, anything containing
        whitespace (including a trailing newline), schemes other than
        ``http``/``https``, a missing host, or an invalid port.
    """
    # Imported here so the helper is self-contained within this block.
    from urllib.parse import urlparse

    if not isinstance(url, str) or not url:
        return False
    # Reject whitespace up front: urlparse silently drops some control
    # characters, which would let "https://example.com\n" through.
    if any(ch.isspace() for ch in url):
        return False
    try:
        parts = urlparse(url)
        host = parts.hostname
        _ = parts.port  # raises ValueError for a non-numeric or out-of-range port
    except ValueError:
        return False
    return parts.scheme in ("http", "https") and bool(host)
def scrape_website(url: str):
    """Load *url* in headless Chromium and return its cleaned main text.

    The first element matching one of a fixed list of "main content"
    selectors is used; when none matches, the ``body`` element is used.

    Args:
        url: Page to load. Navigation waits for ``domcontentloaded`` with a
            30 second timeout.

    Returns:
        A dict with keys ``"title"``, ``"content"`` and ``"url"`` on success,
        or ``None`` on any failure. Failures are logged and also shown to the
        user through ``st.error``.
    """
    logging.info(f"Scraping: {url}")
    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-dev-shm-usage']
        )
        try:
            # Created inside the try so the browser is closed even if this fails.
            page = browser.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            title = page.title()
            selectors = ["#content", ".mw-parser-output", "main",
                         ".main-content", "#main", "article"]
            el = None
            for sel in selectors:
                try:
                    el = page.query_selector(sel)
                except Exception:
                    # A selector that cannot be evaluated is skipped, not fatal.
                    continue
                if el:
                    break
            if not el:
                el = page.query_selector("body")
            if el is None:
                # Report a readable reason instead of an AttributeError on None.
                raise RuntimeError("no readable content element found on the page")
            text = clean_text(el.inner_text())
            if not text:
                # An empty page cannot be indexed, so treat it as a failure here
                # rather than letting the indexing step fail later.
                raise RuntimeError("the page contained no extractable text")
            logging.info(f"Scraped {len(text)} chars")
            return {"title": title, "content": text, "url": url}
        except Exception as e:
            logging.error(f"Scrape error: {e}")
            st.error(f"Scraping failed: {e}")
            return None
        finally:
            browser.close()
 @st.cache_resource
def create_vector_store(text: str):
    """Chunk *text* and index the chunks in a FAISS vector store.

    The text is split into 500-character chunks with a 50-character overlap
    and embedded on CPU with ``sentence-transformers/all-MiniLM-L6-v2``.

    Args:
        text: Cleaned page text to index.

    Returns:
        The FAISS vector store, or ``None`` when indexing fails. Failures are
        logged and shown to the user through ``st.error``.
    """
    # NOTE(review): this function is wrapped in ``st.cache_resource``; a
    # ``None`` result is presumably cached per text as well — confirm whether
    # transient failures should be retried.
    try:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500, chunk_overlap=50, length_function=len
        )
        docs = [
            Document(page_content=c)
            for c in splitter.split_text(text or "")
            if c.strip()
        ]
        if not docs:
            # Fail with a readable reason rather than an opaque error from
            # building an index out of zero documents.
            raise ValueError("no text to index")
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        vs = FAISS.from_documents(docs, embeddings)
        logging.info("Vector store created")
        return vs
    except Exception as e:
        logging.error(f"Vector store error: {e}")
        st.error(f"Indexing failed: {e}")
        return None
def check_ollama() -> bool:
    """Probe the configured Ollama server.

    Returns:
        ``True`` only when ``GET {OLLAMA_BASE_URL}/api/tags`` answers with
        HTTP 200 within 3 seconds; any error at all counts as "not reachable".
    """
    try:
        tags_endpoint = f"{OLLAMA_BASE_URL}/api/tags"
        response = requests.get(tags_endpoint, timeout=3)
    except Exception:
        # Best-effort health probe: never let it break page rendering.
        return False
    return response.status_code == 200
def answer_question(question: str) -> str:
    """Answer *question* from the indexed page using the Ollama chat API.

    The three chunks most similar to the question are retrieved from the
    vector store in ``st.session_state``, placed in a prompt that tells the
    model to answer only from that context, and sent to
    ``{OLLAMA_BASE_URL}/api/chat`` as a single non-streaming request.

    Args:
        question: The user's question.

    Returns:
        The model's answer, or a human-readable message when nothing has been
        indexed, Ollama is unreachable or too slow, or the request fails.
        This function never raises; errors are reported in the returned text.
    """
    if st.session_state.vector_store is None:
        return "No content indexed yet — please scrape a website first."
    request_timeout = 120  # seconds; generation on CPU can be slow
    try:
        docs = st.session_state.vector_store.similarity_search(question, k=3)
        context = " ".join(d.page_content for d in docs)
        prompt = (
            "You are a helpful assistant. Answer the question using ONLY the "
            "context below. If the answer is not in the context, say \"I don't know\".\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Answer:"
        )
        payload = {
            "model": OLLAMA_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "stream": False
        }
        resp = requests.post(
            f"{OLLAMA_BASE_URL}/api/chat",
            json=payload,
            timeout=request_timeout
        )
        if not resp.ok:
            # Surface the server's own explanation (for example a model that
            # has not been pulled) instead of only the bare status line.
            try:
                body = resp.json()
            except ValueError:
                body = None
            detail = body.get("error") if isinstance(body, dict) else None
            raise RuntimeError(
                f"Ollama returned HTTP {resp.status_code}: {detail or resp.reason}"
            )
        data = resp.json()
        message = data.get("message") if isinstance(data, dict) else None
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, str):
            raise RuntimeError("unexpected response format from Ollama")
        return content.strip()
    except requests.exceptions.ConnectionError:
        return (
            "⚠️ Cannot reach Ollama. Make sure Ollama is running and "
            f"`{OLLAMA_BASE_URL}` is accessible."
        )
    except requests.exceptions.Timeout:
        logging.error(f"Answer error: Ollama timed out after {request_timeout}s")
        return (
            f"⚠️ Ollama did not answer within {request_timeout} seconds. "
            "Try again, or use a smaller model."
        )
    except Exception as e:
        logging.error(f"Answer error: {e}")
        return f"Error generating answer: {e}"
# ── Sidebar ────────────────────────────────────────────────────────────────────
import html  # stdlib; escapes scraped text before it is embedded in raw HTML

with st.sidebar:
    st.markdown("<h3 style='font-family:Syne,sans-serif;'>Settings</h3>", unsafe_allow_html=True)
    ollama_url = st.text_input("Ollama URL", value=OLLAMA_BASE_URL)
    model_name = st.text_input("Model", value=OLLAMA_MODEL)
    # The widgets override the configured values for this script run.
    # Surrounding whitespace and a trailing "/" are removed so the request
    # paths built from the base URL stay well-formed; a blank field keeps the
    # previously configured value.
    OLLAMA_BASE_URL = ollama_url.strip().rstrip("/") or OLLAMA_BASE_URL
    OLLAMA_MODEL    = model_name.strip() or OLLAMA_MODEL
    st.markdown("---")
    # Ollama status
    alive = check_ollama()
    dot_class = "active" if alive else ""
    status_text = "Ollama connected" if alive else "Ollama not found"
    st.markdown(f"""
    <div class="status-bar">
        <div class="status-dot {dot_class}"></div>
        <span>{status_text}</span>
    </div>
    """, unsafe_allow_html=True)
    indexed_title = st.session_state.scraped_title
    if indexed_title:
        # Only show the ellipsis when the title was actually cut.
        if len(indexed_title) > 30:
            shown_title = indexed_title[:30] + "…"
        else:
            shown_title = indexed_title
        # The title comes from the scraped page, i.e. untrusted input, and is
        # rendered with unsafe_allow_html — escape it so it cannot inject markup.
        safe_title = html.escape(shown_title)
        st.markdown(f"""
        <div class="status-bar">
            <div class="status-dot active"></div>
            <span>Indexed: {safe_title}</span>
        </div>
        """, unsafe_allow_html=True)
    st.markdown("---")
    st.markdown("""
    <div style='font-size:0.75rem; color:#555; font-family:"DM Mono",monospace;'>
    <b style='color:#888;'>Stack</b><br>
    Playwright · FAISS<br>
    MiniLM embeddings<br>
    Mistral 7B via Ollama
    </div>
    """, unsafe_allow_html=True)
    if not alive:
        st.markdown("""
        <div style='font-size:0.75rem; color:#c8f135; margin-top:1rem;'>
        To start Ollama:<br><br>
        <code style='color:#aaa;'>ollama serve</code><br>
        <code style='color:#aaa;'>ollama pull mistral</code>
        </div>
        """, unsafe_allow_html=True)
# ── Main tabs ──────────────────────────────────────────────────────────────────
tab1, tab2 = st.tabs(["🌐  Scrape", "💬  Chat"])
# ── Tab 1: Scrape ──────────────────────────────────────────────────────────────
# Scrape the entered URL, build the FAISS index, and store both in session state.
with tab1:
    st.markdown("### Enter a URL to scrape and index")
    url_input = st.text_input("URL", placeholder="https://en.wikipedia.org/wiki/Mistral_AI")
    # Pasted URLs often carry stray surrounding whitespace.
    url_input = url_input.strip() if url_input else ""
    if st.button("Scrape & Index"):
        if not url_input or not is_valid_url(url_input):
            st.warning("Please enter a valid URL starting with http:// or https://")
        else:
            with st.spinner("Scraping…"):
                result = scrape_website(url_input)
            if result:
                content = result['content']
                # Pages without a <title> fall back to their URL for display.
                title = result['title'] or result['url']
                with st.spinner("Building FAISS index…"):
                    vector_store = create_vector_store(content)
                st.session_state.scraped_content = content
                st.session_state.vector_store = vector_store
                if vector_store is None:
                    # Indexing failed (the helper has already reported why):
                    # do not advertise the page as indexed.
                    st.session_state.scraped_title = None
                else:
                    st.session_state.scraped_title = title
                    st.success(f"✓ Indexed **{title}** — {len(content):,} characters")
                    with st.expander("Preview scraped text"):
                        # Only mark the preview as truncated when it really is.
                        if len(content) > 3000:
                            preview = content[:3000] + "…"
                        else:
                            preview = content
                        # A non-empty label hidden with label_visibility avoids
                        # Streamlit's empty-label warning.
                        st.text_area(
                            "Scraped text preview",
                            preview,
                            height=250,
                            label_visibility="collapsed",
                        )
# ── Tab 2: Chat ────────────────────────────────────────────────────────────────
# Chat is only available once a page has been indexed.
with tab2:
    if st.session_state.vector_store is not None:
        # Replay the conversation so far.
        for past_message in st.session_state.chat_history:
            role, body = past_message["role"], past_message["content"]
            with st.chat_message(role):
                st.markdown(body)
        # Handle a newly submitted question, if any.
        prompt = st.chat_input("Ask anything about the scraped content…")
        if prompt:
            user_turn = {"role": "user", "content": prompt}
            st.session_state.chat_history.append(user_turn)
            with st.chat_message("user"):
                st.markdown(prompt)
            with st.chat_message("assistant"):
                with st.spinner("Mistral is thinking…"):
                    answer = answer_question(prompt)
                st.markdown(answer)
            assistant_turn = {"role": "assistant", "content": answer}
            st.session_state.chat_history.append(assistant_turn)
        # Offer a reset only when there is something to clear.
        if st.session_state.chat_history and st.button("Clear chat"):
            st.session_state.chat_history = []
            st.rerun()
    else:
        st.info("Scrape a website first (tab above), then come back to chat.")