import threading
import queue
import time
import wikipedia
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import numpy as np
import logging

# Globals
TOPICS = ["Art", "Science", "Technology", "Movies", "Sports", "Politics"]
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.IndexFlatL2(384)
faiss_store = []
metadatas = []
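# Note: all-MiniLM-L6-v2 produces 384-dimensional embeddings, which is why
# IndexFlatL2 is built with dimension 384. A sketch of a less brittle setup,
# should the model ever change, would derive the dimension from the model:
#   dim = embedding_model.get_sentence_embedding_dimension()
#   index = faiss.IndexFlatL2(dim)
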
# In-memory queue replacing Kafka
article_queue = queue.Queue()

# Links for sidebar
hf_links = [
    ("AI Reasoning Copilot", "https://huggingface.co/spaces/faisalsns/ai-reasoning-copilot"),
    ("Language Detection Compare Models", "https://huggingface.co/spaces/faisalsns/language-detection-compare-models/"),
    ("Prompt Playground v1 - Compare Models Output", "https://huggingface.co/spaces/faisalsns/prompt-canvas-engine"),
    ("Mental Disorders Symptoms", "https://huggingface.co/spaces/faisalsns/mental-disorders-symptoms")
]

def get_links():
    otherlinks = "<br>".join([f"[{name}]({url})" for name, url in hf_links])
    return f"### Other Applications To Explore!\n{otherlinks}"

# Wikipedia disambiguation handling
def get_wikipedia_page(topic):
    """Get Wikipedia page with disambiguation handling"""
    try:
        return wikipedia.page(topic)
    except wikipedia.exceptions.DisambiguationError as e:
        # Try the first option from the disambiguation list
        return wikipedia.page(e.options[0])
    except wikipedia.exceptions.PageError:
        # Fall back to a search for alternatives
        search_results = wikipedia.search(topic, results=1)
        if search_results:
            return wikipedia.page(search_results[0])
        raise Exception(f"No page found for topic: {topic}")
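# Illustrative note: wikipedia.page() defaults to auto_suggest=True, which can
# occasionally resolve a topic to an unexpected article; passing
# auto_suggest=False (not done here) would make lookups stricter at the cost of
# more PageError fallbacks.
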
# Agent 1: Scrape articles and push to queue
def agent1_scrape_and_publish(selected_topics, count):
    if not selected_topics:
        return "Please select at least one topic."

    titles = []
    count = min(count, 5)  # Limit to prevent overload

    for topic in selected_topics:
        try:
            # Get the main page for the topic
            page = get_wikipedia_page(topic)
            content = page.content[:3000]  # Limit content size
            article_queue.put({
                "title": page.title,
                "content": content
            })
            titles.append(page.title)

            # Also get related articles if count > 1
            if count > 1:
                try:
                    search_results = wikipedia.search(topic, results=count - 1)
                    for result in search_results[:count - 1]:
                        try:
                            sub_page = wikipedia.page(result)
                            sub_content = sub_page.content[:3000]
                            article_queue.put({
                                "title": sub_page.title,
                                "content": sub_content
                            })
                            titles.append(sub_page.title)
                        except Exception:
                            continue
                except Exception:
                    pass
        except Exception as e:
            logging.error(f"Error fetching {topic}: {e}")
            titles.append(f"ERROR: {topic} - {str(e)}")

    success_count = len([t for t in titles if not t.startswith('ERROR')])
    return f"Scraped {success_count} articles:\n" + "\n".join(titles)
# Agent 2: Consume from queue and index
def agent2_consume_and_index():
    while True:
        try:
            article = article_queue.get(timeout=1)

            # Skip if already indexed
            if any(meta["title"] == article["title"] for meta in metadatas):
                continue

            content = article["content"]

            # Create embedding
            vector = embedding_model.encode(content)

            # Keep the raw vector and its metadata aligned with the FAISS index
            faiss_store.append(vector)
            metadatas.append({
                "title": article["title"],
                "content": content
            })

            # Add to FAISS index (reshape to a 2D array of shape (1, 384))
            index.add(np.array([vector]))
            logging.info(f"Indexed: {article['title']}")
        except queue.Empty:
            time.sleep(0.5)
            continue
        except Exception as e:
            logging.error(f"Error indexing article: {e}")
            continue
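# The FAISS index and the metadatas list grow in lockstep, so hit i returned by
# index.search() maps straight to metadatas[i]; ask_question() below relies on
# that alignment to recover titles and content.
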
# QA function
def ask_question(question):
    if not faiss_store:
        return "Index is empty. Please scrape some articles first by selecting topics and clicking 'Generate from Wikipedia'."
    if not question or not question.strip():
        return "Please enter a question."

    try:
        # Create query embedding
        query_vector = embedding_model.encode(question)

        # Search FAISS index
        D, I = index.search(np.array([query_vector]), k=3)
        if len(I[0]) == 0 or I[0][0] == -1:
            return "No relevant articles found for your question."

        # Get relevant content
        relevant_articles = []
        for idx, score in zip(I[0], D[0]):
            if 0 <= idx < len(metadatas):
                try:
                    title = metadatas[idx]["title"]
                    content = metadatas[idx]["content"]
                    # Limit response length safely
                    content_preview = content[:500] if len(content) > 500 else content
                    relevant_articles.append(f"**{title}**:\n{content_preview}...")
                except (KeyError, IndexError) as e:
                    logging.error(f"Error accessing metadata at index {idx}: {e}")
                    continue

        if not relevant_articles:
            return "No relevant articles found."
        return f"Found {len(relevant_articles)} relevant articles:\n\n" + "\n\n".join(relevant_articles)
    except Exception as e:
        logging.error(f"Error in ask_question: {e}")
        return f"Error processing your question: {str(e)}"
# Status function
def get_status():
    return f"Indexed Articles: {len(metadatas)}\nQueue Size: {article_queue.qsize()}"

# Gradio UI
with gr.Blocks(title="AI Wikipedia Copilot") as demo:
    gr.Markdown("## AI Copilot for Wikipedia")

    with gr.Row():
        with gr.Column(scale=1):
            topic_list = gr.CheckboxGroup(
                choices=TOPICS,
                label="Select Topics",
                value=["Science"]  # Default selection
            )
            topic_count = gr.Slider(
                minimum=1,
                maximum=5,  # agent1_scrape_and_publish caps the count at 5
                step=1,
                value=2,
                label="Articles per Topic"
            )
            scrape_btn = gr.Button("Generate from Wikipedia", variant="primary")
            status_box = gr.Textbox(label="Status", lines=2)
            output_titles = gr.Textbox(label="Scraped Articles", lines=8)

        with gr.Column(scale=2):
            question_box = gr.Textbox(
                label="Ask a question about the scraped articles",
                placeholder="What is artificial intelligence?",
                lines=2
            )
            submit_btn = gr.Button("Submit Question", variant="primary")
            answer_box = gr.Textbox(label="Answer", lines=10)

    gr.Markdown(
        """
        **How it works:**
        1. Select topics and click 'Generate from Wikipedia' to scrape articles
        2. Wait for indexing to complete (check the status box)
        3. Ask questions about the scraped content

        **Note:** I am on a self-directed AI journey. This app scrapes Wikipedia,
        indexes articles in FAISS, and answers questions using embeddings.
        There's no better way to learn than to build it yourself 🚀
        """
    )
    gr.Markdown(get_links())

    # Event handlers
    scrape_btn.click(
        fn=agent1_scrape_and_publish,
        inputs=[topic_list, topic_count],
        outputs=output_titles
    )
    submit_btn.click(
        fn=ask_question,
        inputs=question_box,
        outputs=answer_box
    )

    # Auto-refresh status every 2 seconds
    status_timer = gr.Timer(2.0)
    status_timer.tick(fn=get_status, outputs=status_box)
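# Note: gr.Timer is a relatively recent Gradio addition; on older Gradio
# versions the periodic status refresh would need to be replaced, e.g. with a
# manual "Refresh status" button wired to get_status.
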
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Start the background indexing thread
    indexing_thread = threading.Thread(target=agent2_consume_and_index, daemon=True)
    indexing_thread.start()

    # Launch the app
    demo.launch(server_name="0.0.0.0", server_port=7860)
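
# To run this locally (a sketch; assumes the file is saved as app.py and the
# usual package names on PyPI):
#   pip install gradio faiss-cpu sentence-transformers wikipedia numpy
#   python app.py
# The UI is then served at http://localhost:7860.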