#!/usr/bin/env python3
"""Add more documents to scale the system."""

import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from config import DATA_DIR

import requests

# Compiled once at module level (loop-invariant: same patterns for every topic).
# NOTE(review): regex-based HTML parsing is fragile; acceptable for this
# best-effort scraper, but html.parser would be more robust.
_PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', re.DOTALL)
_TAG_RE = re.compile(r'<.*?>')


def _extract_paragraph_text(html, max_paragraphs=10):
    """Return plain text from the first *max_paragraphs* <p>...</p> blocks.

    Tags inside each paragraph are stripped with a simple regex and the
    paragraphs are joined with blank lines. Returns '' when the page
    contains no <p> blocks (or they strip down to nothing).
    """
    paragraphs = _PARAGRAPH_RE.findall(html)
    return '\n\n'.join(_TAG_RE.sub('', p) for p in paragraphs[:max_paragraphs])


def download_wikipedia_articles():
    """Download sample Wikipedia articles for scaling.

    For each topic, fetches the printable page, extracts up to 10
    paragraphs of plain text, and writes at most 5000 characters to
    DATA_DIR/wikipedia_<topic>.txt. Failures for individual topics are
    logged and skipped so the remaining topics still download.
    """
    topics = [
        "Artificial_intelligence",
        "Machine_learning",
        "Python_(programming_language)",
        "Natural_language_processing",
        "Computer_vision",
        "Deep_learning",
        "Data_science",
        "Big_data",
        "Cloud_computing",
        "Web_development",
    ]

    print(f"Downloading Wikipedia articles to {DATA_DIR}...")

    for topic in topics:
        # printable=yes gives a simpler page that the regex extraction can handle.
        url = f"https://en.wikipedia.org/w/index.php?title={topic}&printable=yes"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                text = _extract_paragraph_text(response.text)
                if text:  # skip pages that yielded no usable text
                    file_path = DATA_DIR / f"wikipedia_{topic}.txt"
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {topic.replace('_', ' ')}\n\n")
                        f.write(text[:5000])  # Limit size
                    print(f" Downloaded: {file_path}")
        except Exception as e:
            # Best-effort: one failed topic must not abort the whole run.
            print(f" Failed to download {topic}: {e}")

    print(f"\nTotal files in data directory: {len(list(DATA_DIR.glob('*.txt')))}")
    print("Run 'python scripts/initialize_rag.py' to rebuild index with new documents")


if __name__ == "__main__":
    download_wikipedia_articles()