# rag-latency-optimization / scripts / download_wikipedia.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""Add more documents to scale the system."""
import re
import sys
from pathlib import Path

# Make the project root importable so `config` resolves when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

import requests

from config import DATA_DIR
# Compiled once at module load: hoisted out of the download loop, where the
# original recompiled (and re-imported `re`) on every iteration.
_PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', re.DOTALL)
_TAG_RE = re.compile(r'<.*?>')


def _extract_paragraph_text(page_html, max_paragraphs=10):
    """Return plain text from the first *max_paragraphs* ``<p>`` blocks.

    Crude regex tag-stripping is intentional: the "printable" Wikipedia
    pages are simple enough that a full HTML parser is unnecessary here.
    Returns an empty string when no paragraphs are found.
    """
    paragraphs = _PARAGRAPH_RE.findall(page_html)
    return '\n\n'.join(_TAG_RE.sub('', p) for p in paragraphs[:max_paragraphs])


def download_wikipedia_articles():
    """Download sample Wikipedia articles for scaling.

    Fetches the printable version of each topic page, extracts the first
    paragraphs of text, and writes one ``wikipedia_<topic>.txt`` file per
    topic into ``DATA_DIR``. Per-topic failures are reported and skipped
    so one bad download does not abort the whole batch.
    """
    topics = [
        "Artificial_intelligence",
        "Machine_learning",
        "Python_(programming_language)",
        "Natural_language_processing",
        "Computer_vision",
        "Deep_learning",
        "Data_science",
        "Big_data",
        "Cloud_computing",
        "Web_development",
    ]
    # Ensure the target directory exists before any write (no-op if present).
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Downloading Wikipedia articles to {DATA_DIR}...")
    for topic in topics:
        url = f"https://en.wikipedia.org/w/index.php?title={topic}&printable=yes"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                text = _extract_paragraph_text(response.text)
                if text:
                    file_path = DATA_DIR / f"wikipedia_{topic}.txt"
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {topic.replace('_', ' ')}\n\n")
                        f.write(text[:5000])  # Limit size
                    print(f" Downloaded: {file_path}")
        # Narrowed from bare `Exception`: only network/HTTP failures are
        # expected here; programming errors should surface, not be swallowed.
        except requests.RequestException as e:
            print(f" Failed to download {topic}: {e}")
    print(f"\nTotal files in data directory: {len(list(DATA_DIR.glob('*.txt')))}")
    print("Run 'python scripts/initialize_rag.py' to rebuild index with new documents")
# Script entry point: run the download only when executed directly,
# never as a side effect of importing this module.
if __name__ == "__main__":
    download_wikipedia_articles()