faisalsns committed on
Commit 70344e5 · 1 Parent(s): 54cbbae

Initial commit for app

Files changed (4)
  1. Dockerfile +12 -0
  2. docker-compose.yml +46 -0
  3. main.py +249 -0
  4. requirements.txt +24 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
+ # Dockerfile
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ CMD ["python", "main.py"]
+
docker-compose.yml ADDED
@@ -0,0 +1,46 @@
+ version: '3.8'
+ services:
+   zookeeper:
+     image: bitnami/zookeeper:latest
+     container_name: zookeeper
+     ports:
+       - "2181:2181"
+     environment:
+       - ALLOW_ANONYMOUS_LOGIN=yes
+     volumes:
+       - zookeeper_data:/bitnami
+
+   kafka:
+     image: bitnami/kafka:3.4
+     container_name: kafka
+     ports:
+       - "9094:9094" # external
+       - "9092:9092" # internal
+     environment:
+       - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
+       - KAFKA_CFG_LISTENERS=INTERNAL://0.0.0.0:9092,EXTERNAL://0.0.0.0:9094
+       - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka:9092,EXTERNAL://localhost:9094
+       - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT
+       - KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL
+       - ALLOW_PLAINTEXT_LISTENER=yes
+       - KAFKA_CFG_BROKER_ID=1
+     depends_on:
+       - zookeeper
+     volumes:
+       - kafka_data:/bitnami
+
+   app:
+     build: .
+     ports:
+       - "7860:7860"
+       - "5678:5678"
+     depends_on:
+       - kafka
+     environment:
+       - PYTHONUNBUFFERED=1
+     volumes:
+       - .:/app
+
+ volumes:
+   zookeeper_data:
+   kafka_data:
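
Note on the listener split above: containers on the compose network (such as the app service) reach the broker through the INTERNAL address kafka:9092, while anything running on the host uses the EXTERNAL address localhost:9094. A minimal host-side smoke test with kafka-python might look like the sketch below; it assumes the stack above is running, reuses the wiki-topic name from main.py, and is illustrative only (not part of this commit):

# host_smoke_test.py (hypothetical helper, not part of this commit)
# Publish one test message through the EXTERNAL listener mapped to the host.
import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers="localhost:9094",  # EXTERNAL listener; containers use kafka:9092 instead
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("wiki-topic", {"title": "Smoke test", "content": "hello from the host"})
producer.flush()  # block until the message is actually delivered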
main.py ADDED
@@ -0,0 +1,249 @@
+ # main.py
+ import gradio as gr
+ import wikipedia
+ import json
+ import uuid
+ from kafka import KafkaProducer, KafkaConsumer
+ from sentence_transformers import SentenceTransformer
+ import faiss
+ import numpy as np
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFaceHub
+
+
+ from wikipedia.exceptions import DisambiguationError, PageError
+
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+
+ # Globals
+ TOPICS = ["Art", "Science", "Technology", "Movies", "Sports", "Politics"]
+ producer = KafkaProducer(bootstrap_servers='kafka:9092', value_serializer=lambda v: json.dumps(v).encode('utf-8'))
+ consumer = KafkaConsumer('wiki-topic', bootstrap_servers='kafka:9092', value_deserializer=lambda m: json.loads(m.decode('utf-8')))
+ embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+ index = faiss.IndexFlatL2(384)  # 384 = embedding dimension of all-MiniLM-L6-v2
+ faiss_store = []
+ metadatas = []
+
+ # Agent 1 - Scrape Wikipedia and produce to Kafka
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def agent1_scrape_and_publish(selected_topics, count):
+     count = min(count, 20)
+     results = []
+
+     # Topic mappings to handle disambiguation (keys match the TOPICS choices above)
+     topic_mappings = {
+         'Art': ['Art', 'The arts', 'Visual arts'],
+         'Science': ['Science', 'Natural science', 'Scientific method'],
+         'Sports': ['Sport', 'Sports', 'Athletic sports'],
+         'Movies': ['Film', 'Cinema', 'Movie'],
+         'Technology': ['Technology', 'Information technology', 'Modern technology']
+     }
+
+     def get_wikipedia_page(topic):
+         """Get a Wikipedia page for a topic, handling disambiguation and missing pages."""
+
+         # Try the exact topic first
+         try:
+             return wikipedia.page(topic)
+         except DisambiguationError as e:
+             logging.info(f"Disambiguation found for '{topic}'. Options: {e.options[:3]}")
+
+             # Try mapped alternatives first
+             if topic in topic_mappings:
+                 for alternative in topic_mappings[topic]:
+                     try:
+                         page = wikipedia.page(alternative)
+                         logging.info(f"Successfully resolved '{topic}' to '{page.title}'")
+                         return page
+                     except (DisambiguationError, PageError):
+                         continue
+
+             # Try the first few disambiguation options
+             for option in e.options[:3]:
+                 try:
+                     page = wikipedia.page(option)
+                     logging.info(f"Used disambiguation option '{option}' for '{topic}'")
+                     return page
+                 except (DisambiguationError, PageError):
+                     continue
+
+             raise Exception(f"Could not resolve disambiguation for '{topic}'")
+
+         except PageError:
+             logging.warning(f"Page not found for '{topic}', trying search...")
+
+             # Try searching for alternatives
+             search_results = wikipedia.search(topic, results=3)
+             if search_results:
+                 for result in search_results:
+                     try:
+                         page = wikipedia.page(result)
+                         logging.info(f"Found alternative '{result}' for '{topic}'")
+                         return page
+                     except Exception:
+                         continue
+
+             raise Exception(f"No suitable page found for '{topic}'")
+
+     for topic in selected_topics[:count]:
+         try:
+             logging.info(f"Processing topic: '{topic}'")
+             page = get_wikipedia_page(topic)
+
+             content = {
+                 'id': str(uuid.uuid4()),
+                 'title': page.title,
+                 'content': page.content[:3000],
+                 'original_topic': topic,  # Keep track of the original request
+                 'url': page.url,
+                 'summary': page.summary[:500]  # Add summary for better context
+             }
+
+             # Send to Kafka
+             future = producer.send('wiki-topic', content)
+             record_metadata = future.get(timeout=10)
+
+             logging.info(f"Successfully published: '{topic}' -> '{page.title}' to {record_metadata.topic}:{record_metadata.partition}")
+             # results.append(f"{topic} -> {page.title}")
+             results.append(f"{topic} -> {page.summary[:50]}...")  # Add summary snippet
+
+         except Exception as e:
+             error_msg = f"Failed to process topic '{topic}': {e}"
+             logging.error(error_msg)
+             results.append(f"{topic} -> ERROR: {str(e)}")
+             # Continue with other topics instead of stopping
+
+     producer.flush()
+     return "\n".join(results)
+
+ # Agent 2 - Consume, format, vectorize, store
+
+ def agent2_consume_and_index():
+     for message in consumer:
+         data = message.value
+         title = data['title'][:100]
+         content = data['content']
+         intro = content[:200]
+         desc = content[200:500]
+         combined = f"{title}. {intro}. {desc}"
+         emb = embedding_model.encode([combined])
+         index.add(np.array(emb))
+         faiss_store.append(combined)
+         metadatas.append({"title": title})
+         if len(faiss_store) >= 20:
+             break  # Stop after 20 articles; the consumer thread exits here
+
+
+ from transformers import pipeline
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+
+ def ask_question(question, min_confidence=0.3):
+     if not faiss_store:
+         return "Index empty. Run Agent 1 and 2 first."
+
+     try:
+         # Initialize embeddings
+         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+         # Create a FAISS database from the stored texts
+         db = FAISS.from_texts(faiss_store, embeddings, metadatas=metadatas)
+
+         # Perform similarity search - increase k for more context
+         relevant_docs = db.similarity_search_with_score(question, k=5)
+
+         # Filter by similarity score (the score is a distance, so lower means more similar)
+         filtered_docs = [doc for doc, score in relevant_docs if score < 0.8]  # Adjust threshold as needed
+
+         if not filtered_docs:
+             return "No relevant information found in the knowledge base."
+
+         # Create a comprehensive context
+         context = "\n\n".join([doc.page_content for doc in filtered_docs])
+
+         # Use a generative model to compose the answer
+         from transformers import pipeline, AutoTokenizer
+
+         # flan-t5-base handles instruction-style Q&A prompts
+         # (note: the pipeline is re-created on every question; caching it would speed up repeated queries)
+         generator = pipeline(
+             "text2text-generation",
+             model="google/flan-t5-base",
+             tokenizer="google/flan-t5-base"
+         )
+
+         # Build the prompt (context truncated to stay within token limits)
+         prompt = f"""Answer the following question based on the provided context. If the context doesn't contain enough information, say so.
+
+ Context: {context[:2000]}
+
+ Question: {question}
+
+ Answer:"""
+
+         response = generator(
+             prompt,
+             max_length=200,
+             min_length=20,
+             temperature=0.7,
+             do_sample=True
+         )
+
+         return response[0]['generated_text'].strip()
+
+     except Exception as e:
+         return f"Error processing question: {str(e)}"
+
+ hf_links = [
+     ("AI Reasoning Copilot", "https://huggingface.co/spaces/faisalsns/ai-reasoning-copilot"),
+     ("Language Detection Compare Models", "https://huggingface.co/spaces/faisalsns/language-detection-compare-models/"),
+     ("Prompt Playground v1 - Compare Models Output", "https://huggingface.co/spaces/faisalsns/prompt-canvas-engine"),
+     ("Mental Disorders Symptoms", "https://huggingface.co/spaces/faisalsns/mental-disorders-symptoms")
+ ]
+
+ def get_links():
+     otherlinks = "<br>".join([f"[{name}]({url})" for name, url in hf_links])
+     return f"### Other Applications To Explore!\n{otherlinks}"
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("## AI Copilot for Wikipedia")
+     with gr.Row():
+         with gr.Column(scale=1):
+             topic_list = gr.CheckboxGroup(choices=TOPICS, label="Select Topics")
+             topic_count = gr.Slider(1, 20, step=1, label="Number of Topics to Scrape")
+             scrape_btn = gr.Button("Generate from Wikipedia")
+             output_titles = gr.Textbox(label="Article Titles", lines=6)
+
+         with gr.Column(scale=2):
+             question_box = gr.Textbox(label="Ask a question")
+             submit_btn = gr.Button("Submit")
+             answer_box = gr.Textbox(label="Answer")
+             gr.Markdown(
+                 """
+ **A Note -**
+ I am on a self-directed AI journey, and for this project I am building an AI Copilot that scrapes Wikipedia content for user-selected categories of articles, indexes it, and then answers questions based on that content.
+ I have always wanted to use Kafka for messaging, so I am using it here. I am also using FAISS for vector storage, since I have already tried ChromaDB, and HuggingFace models for embeddings and question answering.
+
+ There's no better way to learn than to build it yourself 🚀
+ """
+             )
+             gr.Markdown(get_links())
+
+     scrape_btn.click(fn=agent1_scrape_and_publish, inputs=[topic_list, topic_count], outputs=output_titles)
+     submit_btn.click(fn=ask_question, inputs=question_box, outputs=answer_box)
+
+ if __name__ == "__main__":
+     import threading
+     threading.Thread(target=agent2_consume_and_index, daemon=True).start()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
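
As an aside, the retrieval step inside ask_question() can be exercised on its own. Below is a minimal sketch under the same pinned langchain 0.1.x stack; the sample texts are illustrative, and similarity_search_with_score returns a distance, so smaller scores mean closer matches (ask_question keeps documents with score < 0.8):

# retrieval_check.py (hypothetical, illustrative only; not part of this commit)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

texts = [
    "Art. Art is a diverse range of human activity involving creative expression.",
    "Science. Science is a systematic enterprise that builds and organizes knowledge.",
]
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_texts(texts, embeddings, metadatas=[{"title": "Art"}, {"title": "Science"}])

for doc, score in db.similarity_search_with_score("What is art?", k=2):
    # Lower score = closer match, mirroring the score < 0.8 filter in ask_question()
    print(f"{score:.3f}  {doc.metadata['title']}")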
requirements.txt ADDED
@@ -0,0 +1,24 @@
+ # requirements.txt
+ # gradio
+ # wikipedia
+ # kafka-python
+ # sentence-transformers
+ # faiss-cpu
+ # langchain
+ # huggingface-hub
+ # numpy
+
+
+ gradio<4.0.0
+ wikipedia
+ kafka-python
+ sentence-transformers==2.2.2
+ faiss-cpu
+ langchain==0.1.14
+ huggingface-hub==0.19.3
+ numpy
+ pydantic<2.0.0
+ debugpy
+ python-dotenv
+ # huggingface_hub  (duplicate of the pinned huggingface-hub above)
+ langchain-community>=0.0.30,<0.1