# rag-latency-optimization / scripts / download_wikipedia.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""Add more documents to scale the system."""
import re
import sys
from pathlib import Path

# Make the project root importable so `config` resolves when run as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

import requests

from config import DATA_DIR
# Compiled once at module load: hoisted out of the download loop, where the
# original recompiled (and re-imported `re`) on every iteration.
_PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', re.DOTALL)
_TAG_RE = re.compile(r'<.*?>')


def _extract_paragraph_text(page_html, max_paragraphs=10):
    """Return plain text from the first *max_paragraphs* ``<p>`` blocks.

    Crude regex tag-stripping is intentional: the "printable" Wikipedia
    pages are simple enough that a full HTML parser is unnecessary here.
    Returns an empty string when no paragraphs are found.
    """
    paragraphs = _PARAGRAPH_RE.findall(page_html)
    return '\n\n'.join(_TAG_RE.sub('', p) for p in paragraphs[:max_paragraphs])


def download_wikipedia_articles():
    """Download sample Wikipedia articles for scaling.

    Fetches the printable version of each topic page, extracts the first
    paragraphs of text, and writes one ``wikipedia_<topic>.txt`` file per
    topic into ``DATA_DIR``. Per-topic failures are reported and skipped
    so one bad download does not abort the whole batch.
    """
    topics = [
        "Artificial_intelligence",
        "Machine_learning",
        "Python_(programming_language)",
        "Natural_language_processing",
        "Computer_vision",
        "Deep_learning",
        "Data_science",
        "Big_data",
        "Cloud_computing",
        "Web_development",
    ]
    # Ensure the target directory exists before any write (no-op if present).
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Downloading Wikipedia articles to {DATA_DIR}...")
    for topic in topics:
        url = f"https://en.wikipedia.org/w/index.php?title={topic}&printable=yes"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                text = _extract_paragraph_text(response.text)
                if text:
                    file_path = DATA_DIR / f"wikipedia_{topic}.txt"
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(f"# {topic.replace('_', ' ')}\n\n")
                        f.write(text[:5000])  # Limit size
                    print(f" Downloaded: {file_path}")
        # Narrowed from bare `Exception`: only network/HTTP failures are
        # expected here; programming errors should surface, not be swallowed.
        except requests.RequestException as e:
            print(f" Failed to download {topic}: {e}")
    print(f"\nTotal files in data directory: {len(list(DATA_DIR.glob('*.txt')))}")
    print("Run 'python scripts/initialize_rag.py' to rebuild index with new documents")
# Script entry point: run the download only when executed directly,
# never as a side effect of importing this module.
if __name__ == "__main__":
    download_wikipedia_articles()