Upload 4 files
- README.md +57 -0
- app.py +399 -0
- requirements.txt +6 -0
- storage.py +71 -0
README.md
ADDED
@@ -0,0 +1,57 @@
---
title: Scraper Bot v3 — Multi-Turn Chat with Persistent Storage
emoji: 🧠
colorFrom: purple
colorTo: indigo
sdk: gradio
sdk_version: 4.0.0
app_file: app.py
pinned: false
---

# Version 3 — Multi-Turn AI Chatbot with Persistent Storage

An AI assistant with **session and cross-session memory** and **editable user preferences**.

## Features

- **Multi-turn conversation (short-term / session memory)**
  Conversation history is kept in the format `[{"role": "user"|"assistant", "content": "..."}]` and sent to the API on every turn so the model can answer follow-up questions (e.g. *"Tell me more about the second book"*).

- **Persistent storage (cross-session memory)**
  A local JSON file (`chat_storage.json`) stores the conversation history for both tabs (Website Scraper and YouTube Transcript).
  - On startup, the existing history is loaded and shown in the chat.
  - After each turn, the updated history is written back to the file.

- **Editable user preferences**
  You can set preferences at run time (e.g. *"Always respond formally"*, *"Cite sources when summarizing"*). They are saved in the same JSON file and **injected into the system prompt on every API call**.

## Tabs

1. **Bot-Protected Website Scraper** — Enter a URL (e.g. Goodreads), scrape it with Bright Data + BeautifulSoup, then chat with multi-turn memory.
2. **YouTube Transcript Q&A** — Enter a video ID or URL, fetch the transcript with `youtube-transcript-api`, then chat with multi-turn memory.

## Hugging Face Spaces

1. Create a new Space with the **Gradio** SDK.
2. Clone the repo and add `app.py`, `storage.py`, and `requirements.txt`.
3. In **Settings → Repository secrets** add:
   - `GROQ_API_KEY`
   - `BRIGHTDATA_API_KEY`
4. Push; the app will build and run.

Note: On Spaces, `chat_storage.json` is not persistent across restarts unless you use a persistent volume.

## Local run

```bash
pip install -r requirements.txt
```

Create a `.env` file with `GROQ_API_KEY` and `BRIGHTDATA_API_KEY`, then:

```bash
python app.py
```

Open `http://127.0.0.1:7863` (port 7863 avoids a conflict with v2).
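The multi-turn flow described in the README — system prompt first, then the stored history, then the new user message — can be sketched in isolation. The helper name `build_messages` is illustrative, not part of `app.py`, which inlines the same logic in its two chat handlers:

```python
# Minimal sketch of multi-turn message assembly: system prompt, then the
# persisted history, then the new user turn. `build_messages` is a
# hypothetical helper name; app.py inlines this logic.

def build_messages(system_prompt: str, history: list, new_message: str) -> list:
    messages = [{"role": "system", "content": system_prompt}]
    for m in history:
        if m.get("role") in ("user", "assistant"):
            messages.append({"role": m["role"], "content": m.get("content", "")})
    messages.append({"role": "user", "content": new_message.strip()})
    return messages

history = [
    {"role": "user", "content": "What are the top 5 books?"},
    {"role": "assistant", "content": "1. ... 2. ... 3. ... 4. ... 5. ..."},
]
msgs = build_messages(
    "Answer only from the scraped context.",
    history,
    "Tell me more about the second book",
)
print(len(msgs))  # 4: system prompt + two prior turns + the new question
```

Because the prior turns travel with every request, the model can resolve references like "the second book" against its own earlier answer.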
app.py
ADDED
@@ -0,0 +1,399 @@
"""
Version 3 — Multi-Turn AI Chatbot with Persistent Storage

- Multi-turn conversation: the full history is sent to the API so the model can answer follow-ups.
- Persistent storage: chat_storage.json holds conversation history and user preferences across restarts.
- Editable user preferences: injected into the system prompt on every API call.
"""

import os
import re
import urllib3
from urllib.parse import urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq
from youtube_transcript_api import YouTubeTranscriptApi

from storage import load_storage, save_storage

load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
BRIGHTDATA_API_KEY = os.getenv("BRIGHTDATA_API_KEY")

if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY is not set.")

client = Groq(api_key=GROQ_API_KEY)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# In-memory: scraped/transcript context + loaded storage (updated on save)
contexts = {"scraper": "", "youtube": ""}
storage = load_storage()
# Do NOT clear history on startup; persist the full conversation across restarts.


# ---------------------------------------------------------------------------
# Tab 1: Bot-Protected Website Scraper
# ---------------------------------------------------------------------------


def scrape_website(url: str):
    if not url:
        return "Please enter a URL.", ""

    parsed = urlparse(url)
    target_url = url
    if "goodreads.com" in (parsed.netloc or "") and (parsed.path in ("", "/")):
        target_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever"

    api_url = "https://3.232.71.244/request"
    headers = {
        "Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
        "Content-Type": "application/json",
        "Host": "api.brightdata.com",
    }
    payload = {"zone": "goodreads_unlocker", "url": target_url, "format": "raw"}

    try:
        resp = requests.post(
            api_url, json=payload, headers=headers, timeout=120, verify=False
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        if "goodreads.com/list/show/1.Best_Books_Ever" in target_url:
            books_data = []
            book_rows = soup.find_all("tr", itemtype="http://schema.org/Book")
            for idx, row in enumerate(book_rows):
                title_elem = row.find("a", class_="bookTitle")
                author_elem = row.find("a", class_="authorName")
                rating_elem = row.find("span", class_="minirating")
                title = title_elem.text.strip() if title_elem else "Unknown Title"
                author = author_elem.text.strip() if author_elem else "Unknown Author"
                rating = rating_elem.text.strip() if rating_elem else "Unknown Rating"
                books_data.append(
                    {"Rank": idx + 1, "Title": title, "Author": author, "Rating": rating}
                )
            if books_data:
                lines = [
                    "Here is the scraped data from the Goodreads Best Books Ever list:",
                    "",
                ]
                for b in books_data:
                    lines.append(
                        f"{b['Rank']}. {b['Title']} by {b['Author']} - {b['Rating']}"
                    )
                text_content = "\n".join(lines)
            else:
                text_content = soup.get_text(separator=" ", strip=True)
        else:
            text_content = soup.get_text(separator=" ", strip=True)

        contexts["scraper"] = text_content[:15000]
        preview = (
            text_content[:500] + "..." if len(text_content) > 500 else text_content
        )
        return "Website scraped successfully. You can now chat about it.", preview
    except Exception as e:
        contexts["scraper"] = ""
        return f"Error scraping website: {e}", ""


# ---------------------------------------------------------------------------
# Tab 2: YouTube Transcript Q&A
# ---------------------------------------------------------------------------

_YOUTUBE_ID_REGEX = re.compile(
    r"(https?://)?(www\.)?"
    r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
    r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
)


def _extract_video_id(url_or_id: str) -> str:
    match = _YOUTUBE_ID_REGEX.search(url_or_id)
    if match:
        return match.group(6)
    return url_or_id.strip()


def fetch_youtube_transcript(video_input: str):
    if not video_input:
        return "Please enter a YouTube Video URL or ID.", ""

    video_id = _extract_video_id(video_input)
    try:
        api = YouTubeTranscriptApi()
        transcript_list = api.list(video_id)
        transcript = None
        try:
            transcript = transcript_list.find_transcript(["en", "ur"])
        except Exception:
            try:
                transcript = transcript_list.find_generated_transcript(["en", "ur"])
            except Exception:
                for t in transcript_list:
                    transcript = t
                    break
        if transcript is None:
            raise Exception("No transcript available for this video.")
        transcript_data = transcript.fetch()
        pieces = []
        for t in transcript_data:
            if isinstance(t, dict):
                pieces.append(t.get("text", ""))
            else:
                pieces.append(getattr(t, "text", ""))
        transcript_text = " ".join(pieces)
        contexts["youtube"] = transcript_text[:15000]
        preview = (
            transcript_text[:500] + "..."
            if len(transcript_text) > 500
            else transcript_text
        )
        return (
            "Transcript fetched successfully. You can now chat about the video.",
            preview,
        )
    except Exception as e:
        contexts["youtube"] = ""
        return f"Error: No transcript for video ID ({video_id}). Details: {e}", ""


# ---------------------------------------------------------------------------
# Multi-turn chat with persistent storage and user preferences
# ---------------------------------------------------------------------------


def _build_system_prompt(mode: str) -> str:
    context = contexts.get(mode, "")
    prefs = storage.get("user_preferences", "").strip()
    ctx_placeholder = "(None — the user has NOT scraped or fetched a transcript yet. You must refuse to answer and tell them to scrape/fetch first.)"
    base = (
        "You are a helpful assistant. You must use ONLY the provided context to answer. "
        "NEVER use external knowledge or general information. If the context says 'None' or the user has not scraped yet, refuse to answer and tell them to scrape the website or fetch the transcript first. "
        "If the answer is not in the context, say so. You have conversation history for follow-up questions (e.g. 'tell me more about the second book').\n\n"
        f"Context:\n{context.strip() if context else ctx_placeholder}"
    )
    if prefs:
        base += f"\n\nUser preferences (follow these):\n{prefs}"
    return base


def chat_turn_scraper(message: str, _history_ignored):
    """One turn in the scraper tab: use storage as the source of truth, append the
    message, call the API with the full history, persist, and return the new history."""
    if not message or not message.strip():
        return "", storage.get("scraper_history", [])

    context = contexts.get("scraper", "") or ""
    if not context.strip():
        history_dicts = list(storage.get("scraper_history", []))
        history_dicts.append({"role": "user", "content": message.strip()})
        history_dicts.append(
            {
                "role": "assistant",
                "content": "Please scrape a website first, then ask questions.",
            }
        )
        storage["scraper_history"] = history_dicts
        save_storage(storage)
        return "", history_dicts

    # Use persisted storage as the source of truth — not the Gradio input — so the
    # full history is always kept.
    history_dicts = list(storage.get("scraper_history", []))
    history_dicts.append({"role": "user", "content": message.strip()})

    system_prompt = _build_system_prompt("scraper")
    messages = [{"role": "system", "content": system_prompt}]
    for m in history_dicts:
        if m.get("role") in ("user", "assistant"):
            messages.append({"role": m["role"], "content": m.get("content", "")})

    try:
        resp = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            max_tokens=1024,
            temperature=0.3,
        )
        reply = resp.choices[0].message.content
    except Exception as e:
        reply = f"Error communicating with Groq: {e}"

    history_dicts.append({"role": "assistant", "content": reply})
    storage["scraper_history"] = history_dicts
    save_storage(storage)

    return "", history_dicts


def chat_turn_youtube(message: str, _history_ignored):
    """One turn in the YouTube tab: use storage as the source of truth, same pattern as the scraper."""
    if not message or not message.strip():
        return "", storage.get("youtube_history", [])

    context = contexts.get("youtube", "") or ""
    if not context.strip():
        history_dicts = list(storage.get("youtube_history", []))
        history_dicts.append({"role": "user", "content": message.strip()})
        history_dicts.append(
            {
                "role": "assistant",
                "content": "Please fetch a YouTube transcript first, then ask questions.",
            }
        )
        storage["youtube_history"] = history_dicts
        save_storage(storage)
        return "", history_dicts

    # Use persisted storage as the source of truth
    history_dicts = list(storage.get("youtube_history", []))
    history_dicts.append({"role": "user", "content": message.strip()})

    system_prompt = _build_system_prompt("youtube")
    messages = [{"role": "system", "content": system_prompt}]
    for m in history_dicts:
        if m.get("role") in ("user", "assistant"):
            messages.append({"role": m["role"], "content": m.get("content", "")})

    try:
        resp = client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=messages,
            max_tokens=1024,
            temperature=0.3,
        )
        reply = resp.choices[0].message.content
    except Exception as e:
        reply = f"Error communicating with Groq: {e}"

    history_dicts.append({"role": "assistant", "content": reply})
    storage["youtube_history"] = history_dicts
    save_storage(storage)

    return "", history_dicts


def save_preferences(prefs: str):
    """Save user preferences to storage and confirm."""
    global storage
    storage["user_preferences"] = prefs or ""
    save_storage(storage)
    return "Preferences saved. They will be applied to all future replies."


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# Chatbots start from the saved history; each turn appends to it and re-saves storage.
with gr.Blocks(title="Scraper Bot v3 — Multi-Turn Chat with Memory") as demo:
    gr.Markdown("# Version 3 — Multi-Turn AI Chatbot with Persistent Storage")
    gr.Markdown(
        "Chat with memory: conversation history is kept and sent to the AI so you can ask follow-ups. "
        "History and preferences are saved to `chat_storage.json` and persist across restarts."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### User preferences (editable)")
            gr.Markdown(
                "These are injected into the system prompt on every reply. Examples: "
                "*Always respond formally*, *Cite sources when summarizing*, *Keep answers under 3 sentences*."
            )
            prefs_input = gr.Textbox(
                label="Preferences",
                value=storage.get("user_preferences", ""),
                placeholder="e.g., Always respond formally. Cite sources when summarizing.",
                lines=3,
            )
            save_prefs_btn = gr.Button("Save preferences")
            prefs_status = gr.Textbox(label="Status", interactive=False)
            save_prefs_btn.click(
                save_preferences,
                inputs=[prefs_input],
                outputs=[prefs_status],
            )

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("Bot-Protected Website Scraper"):
                    gr.Markdown(
                        "Enter a URL to scrape (e.g. Goodreads). Then chat; history is kept for follow-up questions."
                    )
                    with gr.Row():
                        url_input = gr.Textbox(
                            label="URL",
                            placeholder="https://www.goodreads.com/",
                            scale=3,
                        )
                        scrape_btn = gr.Button("Scrape URL", scale=1)
                    scrape_status = gr.Textbox(label="Status", interactive=False)
                    scrape_preview = gr.Textbox(
                        label="Content preview", interactive=False, lines=4
                    )
                    scraper_chatbot = gr.Chatbot(
                        value=storage.get("scraper_history", []),
                        type="messages",  # history is stored as role/content dicts
                        height=400,
                        label="Chat (multi-turn, persisted)",
                    )
                    scraper_msg = gr.Textbox(
                        label="Message",
                        placeholder="e.g., What are the top 5 books? Then: Tell me more about the second one.",
                    )
                    scrape_btn.click(
                        scrape_website,
                        inputs=[url_input],
                        outputs=[scrape_status, scrape_preview],
                    )
                    scraper_msg.submit(
                        chat_turn_scraper,
                        inputs=[scraper_msg, scraper_chatbot],
                        outputs=[scraper_msg, scraper_chatbot],
                    )

                with gr.TabItem("YouTube Transcript Q&A"):
                    gr.Markdown(
                        "Enter a YouTube video URL or ID to fetch its transcript, then chat with multi-turn memory."
                    )
                    with gr.Row():
                        yt_input = gr.Textbox(
                            label="YouTube URL or ID",
                            placeholder="dQw4w9WgXcQ",
                            scale=3,
                        )
                        yt_btn = gr.Button("Get Transcript", scale=1)
                    yt_status = gr.Textbox(label="Status", interactive=False)
                    yt_preview = gr.Textbox(
                        label="Transcript preview", interactive=False, lines=4
                    )
                    yt_chatbot = gr.Chatbot(
                        value=storage.get("youtube_history", []),
                        type="messages",  # history is stored as role/content dicts
                        height=400,
                        label="Chat (multi-turn, persisted)",
                    )
                    yt_msg = gr.Textbox(
                        label="Message",
                        placeholder="e.g., Summarize the video. Then: What are the main takeaways?",
                    )
                    yt_btn.click(
                        fetch_youtube_transcript,
                        inputs=[yt_input],
                        outputs=[yt_status, yt_preview],
                    )
                    yt_msg.submit(
                        chat_turn_youtube,
                        inputs=[yt_msg, yt_chatbot],
                        outputs=[yt_msg, yt_chatbot],
                    )

if __name__ == "__main__":
    if os.environ.get("SPACE_ID"):
        demo.launch()  # Hugging Face Spaces: use their host/port
    else:
        demo.launch(server_name="127.0.0.1", server_port=7863)  # Local
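The `_YOUTUBE_ID_REGEX` in app.py can be exercised standalone to confirm it accepts the common URL shapes as well as bare 11-character IDs (the pattern below is copied from app.py so the snippet is self-contained; `extract_video_id` mirrors the private `_extract_video_id` helper):

```python
import re

# Same pattern as _YOUTUBE_ID_REGEX in app.py, copied here so this
# snippet runs on its own.
YOUTUBE_ID_REGEX = re.compile(
    r"(https?://)?(www\.)?"
    r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
    r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
)

def extract_video_id(url_or_id: str) -> str:
    """Group 6 is the 11-character video ID; bare IDs fall through unchanged."""
    match = YOUTUBE_ID_REGEX.search(url_or_id)
    return match.group(6) if match else url_or_id.strip()

print(extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(extract_video_id("https://youtu.be/dQw4w9WgXcQ"))                 # dQw4w9WgXcQ
print(extract_video_id("dQw4w9WgXcQ"))                                  # dQw4w9WgXcQ
```

Because the regex requires a `youtube`/`youtu.be` host before capturing, anything without one (such as a bare ID) simply fails to match and is returned as-is after stripping.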
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio>=4.0.0,<7.0.0
groq>=0.4.0
beautifulsoup4>=4.12.0
requests>=2.28.0
python-dotenv>=1.0.0
youtube-transcript-api>=1.0.0
storage.py
ADDED
@@ -0,0 +1,71 @@
"""
Persistent storage for Version 3: conversation history and user preferences.
Uses a local JSON file (chat_storage.json) for cross-session persistence.
"""

import json
import os

STORAGE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chat_storage.json")

DEFAULT_STORAGE = {
    "scraper_history": [],
    "youtube_history": [],
    "user_preferences": "",
}


def load_storage() -> dict:
    """Load conversation history and user preferences from JSON; fall back to
    the defaults if the file is missing or unreadable."""
    if not os.path.exists(STORAGE_PATH):
        return dict(DEFAULT_STORAGE)
    try:
        with open(STORAGE_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        data.setdefault("scraper_history", [])
        data.setdefault("youtube_history", [])
        data.setdefault("user_preferences", "")
        return data
    except (json.JSONDecodeError, IOError):
        return dict(DEFAULT_STORAGE)


def save_storage(data: dict) -> None:
    """Write storage to JSON."""
    with open(STORAGE_PATH, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def history_dicts_to_tuples(history: list) -> list:
    """Convert [{"role": "user", "content": ...}, {"role": "assistant", "content": ...}, ...]
    to [(user, assistant), ...] for a tuple-format Gradio Chatbot."""
    tuples = []
    i = 0
    while i < len(history):
        msg = history[i]
        if not isinstance(msg, dict):
            i += 1
            continue
        role = msg.get("role")
        content = msg.get("content") or ""
        if role == "user":
            next_msg = history[i + 1] if i + 1 < len(history) else None
            if next_msg and isinstance(next_msg, dict) and next_msg.get("role") == "assistant":
                tuples.append((content, next_msg.get("content") or ""))
                i += 2
            else:
                tuples.append((content, ""))
                i += 1
        else:
            i += 1
    return tuples


def history_tuples_to_dicts(tuples: list) -> list:
    """Convert Gradio Chatbot [(user, assistant), ...] back to
    [{"role": "user", ...}, {"role": "assistant", ...}, ...]."""
    out = []
    for user_msg, assistant_msg in tuples or []:
        if user_msg:
            out.append({"role": "user", "content": str(user_msg)})
        if assistant_msg:
            out.append({"role": "assistant", "content": str(assistant_msg)})
    return out
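The two converters in storage.py can be round-tripped as a quick sanity check. Below is a trimmed, self-contained copy of the same pairing logic (kept separate so this snippet runs without the module):

```python
# Trimmed copies of the storage.py converters, for illustration only.

def history_dicts_to_tuples(history):
    """Pair consecutive user/assistant dicts into (user, assistant) tuples."""
    tuples, i = [], 0
    while i < len(history):
        msg = history[i]
        if not isinstance(msg, dict) or msg.get("role") != "user":
            i += 1
            continue
        nxt = history[i + 1] if i + 1 < len(history) else None
        if isinstance(nxt, dict) and nxt.get("role") == "assistant":
            tuples.append((msg.get("content") or "", nxt.get("content") or ""))
            i += 2
        else:
            tuples.append((msg.get("content") or "", ""))
            i += 1
    return tuples

def history_tuples_to_dicts(tuples):
    """Inverse direction: Chatbot tuples back to role/content dicts."""
    out = []
    for user_msg, assistant_msg in tuples or []:
        if user_msg:
            out.append({"role": "user", "content": str(user_msg)})
        if assistant_msg:
            out.append({"role": "assistant", "content": str(assistant_msg)})
    return out

history = [
    {"role": "user", "content": "What are the top 5 books?"},
    {"role": "assistant", "content": "1. The Hunger Games ..."},
]
pairs = history_dicts_to_tuples(history)
print(pairs)  # [('What are the top 5 books?', '1. The Hunger Games ...')]
assert history_tuples_to_dicts(pairs) == history
```

Note the round trip is lossless only when every user turn has a non-empty assistant reply; an unpaired user message comes back without its empty placeholder.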