# NOTE(review): removed stray "Spaces: / Sleeping / Sleeping" text — Hugging Face
# Spaces page chrome captured by the scrape/extraction, not part of this script.
| #!/usr/bin/env python3 | |
| """Add more documents to scale the system.""" | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from config import DATA_DIR | |
| import requests | |
def download_wikipedia_articles():
    """Download sample Wikipedia articles as plain-text files for scaling.

    For each topic, fetches the printable page, extracts the first ten
    ``<p>`` paragraphs, strips tags, decodes HTML entities, and writes up
    to 5000 characters into ``DATA_DIR/wikipedia_<topic>.txt``. Each topic
    is best-effort: a failed download is reported and the batch continues.
    """
    # Local imports so this block is self-contained; hoisted out of the
    # loop (the original re-ran `import re` on every successful response).
    import html
    import re

    topics = [
        "Artificial_intelligence",
        "Machine_learning",
        "Python_(programming_language)",
        "Natural_language_processing",
        "Computer_vision",
        "Deep_learning",
        "Data_science",
        "Big_data",
        "Cloud_computing",
        "Web_development",
    ]

    # Compile once, outside the loop. `<p[^>]*>` also matches paragraph
    # tags carrying attributes (e.g. `<p class="...">`), which real
    # Wikipedia markup uses — the previous pattern `<p>` silently skipped
    # nearly all of them.
    paragraph_re = re.compile(r'<p[^>]*>(.*?)</p>', re.DOTALL)
    tag_re = re.compile(r'<.*?>')

    print(f"Downloading Wikipedia articles to {DATA_DIR}...")
    for topic in topics:
        url = f"https://en.wikipedia.org/w/index.php?title={topic}&printable=yes"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                paragraphs = paragraph_re.findall(response.text)
                if paragraphs:
                    # Strip remaining inline tags and decode entities such
                    # as &amp; so the saved file is genuinely plain text;
                    # keep only the first 10 paragraphs to bound size.
                    text = '\n\n'.join(
                        html.unescape(tag_re.sub('', p)) for p in paragraphs[:10]
                    )
                    file_path = DATA_DIR / f"wikipedia_{topic}.txt"
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {topic.replace('_', ' ')}\n\n")
                        f.write(text[:5000])  # Limit size
                    print(f" Downloaded: {file_path}")
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f" Failed to download {topic}: {e}")

    print(f"\nTotal files in data directory: {len(list(DATA_DIR.glob('*.txt')))}")
    print("Run 'python scripts/initialize_rag.py' to rebuild index with new documents")
# Script entry point: only fetch when run directly, not when imported.
if __name__ == "__main__":
    download_wikipedia_articles()