File size: 1,926 Bytes
04ab625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python3
"""Add more documents to scale the system."""
import sys
from pathlib import Path

# Make the project root importable before pulling in project modules.
sys.path.insert(0, str(Path(__file__).parent.parent))

import html
import re

import requests

from config import DATA_DIR

def download_wikipedia_articles():
    """Download printable Wikipedia articles into DATA_DIR as plain-text files.

    For each hard-coded topic, fetches the printable page over HTTP,
    extracts the first 10 ``<p>`` paragraphs, strips markup, and writes up
    to 5000 characters to ``DATA_DIR/wikipedia_<topic>.txt``. Failures are
    reported and skipped so one bad download does not abort the batch.
    """
    topics = [
        "Artificial_intelligence",
        "Machine_learning",
        "Python_(programming_language)",
        "Natural_language_processing",
        "Computer_vision",
        "Deep_learning",
        "Data_science",
        "Big_data",
        "Cloud_computing",
        "Web_development",
    ]

    # Compile once outside the loop instead of re-importing `re` and
    # re-scanning the pattern strings on every iteration.
    paragraph_re = re.compile(r'<p>(.*?)</p>', re.DOTALL)
    tag_re = re.compile(r'<.*?>')

    print(f"Downloading Wikipedia articles to {DATA_DIR}...")

    for topic in topics:
        url = f"https://en.wikipedia.org/w/index.php?title={topic}&printable=yes"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                paragraphs = paragraph_re.findall(response.text)
                if paragraphs:
                    # Strip tags AND decode HTML entities (&amp;, &#39;, ...)
                    # so the saved file is genuine plain text.
                    cleaned = [html.unescape(tag_re.sub('', p)) for p in paragraphs[:10]]
                    text = '\n\n'.join(cleaned)
                    file_path = DATA_DIR / f"wikipedia_{topic}.txt"
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {topic.replace('_', ' ')}\n\n")
                        f.write(text[:5000])  # Limit size
                    print(f"  Downloaded: {file_path}")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the
            # remaining topics rather than aborting the whole run.
            print(f"  Failed to download {topic}: {e}")

    print(f"\nTotal files in data directory: {len(list(DATA_DIR.glob('*.txt')))}")
    print("Run 'python scripts/initialize_rag.py' to rebuild index with new documents")

# Script entry point: download the articles when run directly.
if __name__ == "__main__":
    download_wikipedia_articles()