import threading
import queue
import time
import json
import wikipedia
from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr
import numpy as np
import logging
# Globals
TOPICS = ["Art", "Science", "Technology", "Movies", "Sports", "Politics"]
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
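# all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the index below must use dimension 384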
index = faiss.IndexFlatL2(384)
faiss_store = []
metadatas = []
# In-memory queue replacing Kafka
article_queue = queue.Queue()
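# Agent 1 (the scrape button handler) produces article dicts into this queue;
# agent 2 consumes them from a background daemon thread and indexes them.
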
# Links for sidebar
hf_links = [
("AI Reasoning Copilot", "https://huggingface.co/spaces/faisalsns/ai-reasoning-copilot"),
("Language Detection Compare Models", "https://huggingface.co/spaces/faisalsns/language-detection-compare-models/"),
("Prompt Playground v1 - Compare Models Output", "https://huggingface.co/spaces/faisalsns/prompt-canvas-engine"),
("Mental Disorders Symptoms", "https://huggingface.co/spaces/faisalsns/mental-disorders-symptoms")
]
def get_links():
    otherlinks = "<br>".join([f"[{name}]({url})" for name, url in hf_links])
    return f"### Other Applications To Explore!\n{otherlinks}"

# Wikipedia disambiguation handling
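# DisambiguationError exposes the candidate pages via e.options; we fall back to
# the first candidate, and on a missing page we fall back to a title search.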
def get_wikipedia_page(topic):
"""Get Wikipedia page with disambiguation handling"""
try:
return wikipedia.page(topic)
except wikipedia.exceptions.DisambiguationError as e:
# Try the first option from disambiguation
return wikipedia.page(e.options[0])
except wikipedia.exceptions.PageError:
# Try searching for alternatives
search_results = wikipedia.search(topic, results=1)
if search_results:
return wikipedia.page(search_results[0])
raise Exception(f"No page found for topic: {topic}")
# Agent 1: Scrape articles and push to queue
def agent1_scrape_and_publish(selected_topics, count):
    if not selected_topics:
        return "Please select at least one topic."
    titles = []
    count = min(int(count), 5)  # Hard cap at 5 per topic to prevent overload
    for topic in selected_topics:
        try:
            # Get the main page for the topic
            page = get_wikipedia_page(topic)
            content = page.content[:3000]  # Limit content size
            article_queue.put({
                "title": page.title,
                "content": content
            })
            titles.append(page.title)
            # Also queue related articles if count > 1
            if count > 1:
                try:
                    search_results = wikipedia.search(topic, results=count - 1)
                    for result in search_results[:count - 1]:
                        try:
                            sub_page = wikipedia.page(result)
                            sub_content = sub_page.content[:3000]
                            article_queue.put({
                                "title": sub_page.title,
                                "content": sub_content
                            })
                            # Record the title (not the summary) so the report stays compact
                            titles.append(sub_page.title)
                        except Exception:
                            continue
                except Exception:
                    pass
        except Exception as e:
            logging.error(f"Error fetching {topic}: {e}")
            titles.append(f"ERROR: {topic} - {str(e)}")
    success_count = len([t for t in titles if not t.startswith("ERROR")])
    return f"Scraped {success_count} articles:\n" + "\n".join(titles)

# Agent 2: Consume from queue and index
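# Runs forever in a daemon thread (started under __main__), polling the queue so
# newly scraped articles become searchable without blocking the Gradio UI.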
def agent2_consume_and_index():
    while True:
        try:
            article = article_queue.get(timeout=1)
            # Skip if already indexed
            if any(meta["title"] == article["title"] for meta in metadatas):
                continue
            content = article["content"]
            # Create the embedding (a float32 numpy vector of dimension 384)
            vector = embedding_model.encode(content)
            # Keep a parallel copy of the vector and its metadata
            faiss_store.append(vector)
            metadatas.append({
                "title": article["title"],
                "content": content
            })
            # Add to the FAISS index (reshaped to a 2D array of shape (1, 384))
            index.add(np.array([vector]))
            logging.info(f"Indexed: {article['title']}")
        except queue.Empty:
            time.sleep(0.5)
            continue
        except Exception as e:
            logging.error(f"Error indexing article: {e}")
            continue

# QA function
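# Pure retrieval, no generation: the "answer" is the top-3 nearest article
# previews, ranked by L2 distance between question and article embeddings.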
def ask_question(question):
    if not faiss_store:
        return "Index is empty. Please scrape some articles first by selecting topics and clicking 'Generate from Wikipedia'."
    if not question or not question.strip():
        return "Please enter a question."
    try:
        # Create the query embedding
        query_vector = embedding_model.encode(question)
        # Search the FAISS index for the 3 nearest articles
        D, I = index.search(np.array([query_vector]), k=3)
        if len(I[0]) == 0 or I[0][0] == -1:
            return "No relevant articles found for your question."
        # Get the relevant content
        relevant_articles = []
        for idx, score in zip(I[0], D[0]):
            if 0 <= idx < len(metadatas):
                try:
                    title = metadatas[idx]["title"]
                    content = metadatas[idx]["content"]
                    # Truncate long content; append an ellipsis only when truncated
                    content_preview = content[:500] + ("..." if len(content) > 500 else "")
                    relevant_articles.append(f"**{title}**:\n{content_preview}")
                except (KeyError, IndexError) as e:
                    logging.error(f"Error accessing metadata at index {idx}: {e}")
                    continue
        if not relevant_articles:
            return "No relevant articles found."
        return f"Found {len(relevant_articles)} relevant articles:\n\n" + "\n\n".join(relevant_articles)
    except Exception as e:
        logging.error(f"Error in ask_question: {e}")
        return f"Error processing your question: {str(e)}"

# Status function
def get_status():
return f"Indexed Articles: {len(metadatas)}\nQueue Size: {article_queue.qsize()}"
# Gradio UI
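# Left column: topic selection and scraping controls; right column: Q&A.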
with gr.Blocks(title="AI Wikipedia Copilot") as demo:
gr.Markdown("## AI Copilot for Wikipedia")
with gr.Row():
with gr.Column(scale=1):
topic_list = gr.CheckboxGroup(
choices=TOPICS,
label="Select Topics",
value=["Science"] # Default selection
)
topic_count = gr.Slider(
minimum=1,
maximum=10,
step=1,
value=2,
label="Articles per Topic"
)
scrape_btn = gr.Button("Generate from Wikipedia", variant="primary")
status_box = gr.Textbox(label="Status", lines=2)
output_titles = gr.Textbox(label="Scraped Articles", lines=8)
with gr.Column(scale=2):
question_box = gr.Textbox(
label="Ask a question about the scraped articles",
placeholder="What is artificial intelligence?",
lines=2
)
submit_btn = gr.Button("Submit Question", variant="primary")
answer_box = gr.Textbox(label="Answer", lines=10)
gr.Markdown(
"""
**How it works:**
1. Select topics and click 'Generate from Wikipedia' to scrape articles
2. Wait for indexing to complete (check status)
3. Ask questions about the scraped content
**Note:** I am on a self-directed AI journey. This app scrapes Wikipedia,
indexes articles in FAISS, and answers questions using embeddings.
There's no better way to learn than build it yourself 🚀
"""
)
gr.Markdown(get_links())
# Event handlers
scrape_btn.click(
fn=agent1_scrape_and_publish,
inputs=[topic_list, topic_count],
outputs=output_titles
)
submit_btn.click(
fn=ask_question,
inputs=question_box,
outputs=answer_box
)
# Auto-refresh status every 2 seconds
status_timer = gr.Timer(2.0)
status_timer.tick(fn=get_status, outputs=status_box)
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    # Start the background indexing thread
    indexing_thread = threading.Thread(target=agent2_consume_and_index, daemon=True)
    indexing_thread.start()
    # Launch the app
    demo.launch(server_name="0.0.0.0", server_port=7860)