Spaces:

Ariyan-Pro
/

rag-latency-optimization

Running

App Files Files Community

rag-latency-optimization / scripts /download_sample_data.py

Ariyan-Pro

Deploy RAG Latency Optimization v1.0

04ab625 3 days ago

raw

history blame contribute delete

4.23 kB

	#!/usr/bin/env python3
	"""
	Create sample documents on disk for testing (generated locally; nothing is downloaded).
	"""
	import requests
	import zipfile
	from pathlib import Path
	import sys
	import os

	# Add the parent directory to Python path so we can import config
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from config import DATA_DIR

	def _sample_documents():
	    """Return the hand-written sample documents as (file name, content) pairs."""
	    import textwrap  # local import: only this helper needs it

	    # Contents are indented to match the code and dedented here, so the
	    # files on disk start at column 0 regardless of source indentation.
	    raw_documents = [
	        (
	            "machine_learning_intro.md",
	            """\
	            # Machine Learning Introduction
	            Machine learning is a subset of artificial intelligence that enables systems
	            to learn and improve from experience without being explicitly programmed.

	            ## Types of Machine Learning
	            1. Supervised Learning
	            2. Unsupervised Learning
	            3. Reinforcement Learning

	            ## Applications
	            - Natural Language Processing
	            - Computer Vision
	            - Recommendation Systems
	            - Predictive Analytics""",
	        ),
	        (
	            # The fenced code blocks and the "Basic Example" heading were
	            # mangled in the original literal; they are restored here so the
	            # sample is valid Markdown.
	            "fastapi_guide.md",
	            """\
	            # FastAPI Guide
	            FastAPI is a modern, fast web framework for building APIs with Python 3.7+.

	            ## Key Features
	            - Fast: Very high performance
	            - Easy: Easy to use and learn
	            - Standards-based: Based on OpenAPI and JSON Schema

	            ## Installation
	            ```bash
	            pip install fastapi uvicorn
	            ```

	            ## Basic Example
	            ```python
	            from fastapi import FastAPI

	            app = FastAPI()

	            @app.get("/")
	            def read_root():
	                return {"Hello": "World"}
	            ```""",
	        ),
	        (
	            "python_basics.txt",
	            """\
	            Python Programming Basics

	            Python is an interpreted, high-level programming language known for its readability.
	            Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.

	            Data Types:
	            - Integers, Floats
	            - Strings
	            - Lists, Tuples
	            - Dictionaries
	            - Sets

	            Control Structures:
	            - if/else statements
	            - for loops
	            - while loops
	            - try/except blocks""",
	        ),
	        (
	            "database_concepts.md",
	            """\
	            # Database Concepts

	            ## SQL vs NoSQL
	            SQL databases are relational, NoSQL databases are non-relational.

	            ## Common Databases
	            1. PostgreSQL
	            2. MySQL
	            3. MongoDB
	            4. Redis

	            ## Indexing
	            Indexes improve query performance but slow down write operations.
	            Common index types: B-tree, Hash, Bitmap.""",
	        ),
	        (
	            "web_development.txt",
	            """\
	            Web Development Overview

	            Frontend: HTML, CSS, JavaScript
	            Backend: Python, Node.js, Java, Go
	            Databases: SQL, NoSQL
	            DevOps: Docker, Kubernetes, CI/CD

	            Frameworks:
	            - React, Vue, Angular (Frontend)
	            - Django, Flask, FastAPI (Python)
	            - Express.js (Node.js)
	            - Spring Boot (Java)""",
	        ),
	    ]
	    return [(name, textwrap.dedent(text)) for name, text in raw_documents]


	def _topic_overview(topic):
	    """Return the generated overview text for a single topic name."""
	    parts = [
	        f"# {topic.title()} Overview\n\n",
	        f"This document discusses key concepts in {topic}.\n\n",
	        "## Key Concepts\n",
	    ]
	    for number in range(1, 6):
	        parts.append(f"{number}. Important aspect {number} of {topic}\n")
	        parts.extend(
	            f" - Detail {number}{letter} about this aspect\n" for letter in "abc"
	        )
	        parts.append("\n")  # blank line between numbered aspects
	    parts.append("## Applications\n")
	    parts.extend(f"- Application {index} of {topic}\n" for index in range(1, 4))
	    return "".join(parts)


	def download_sample_data(data_dir=None):
	    """Write a small sample dataset of documents to disk.

	    Despite the name, nothing is fetched over the network: five hand-written
	    documents and six generated "<topic>_overview.txt" files are created,
	    overwriting any files of the same name.

	    Args:
	        data_dir: Directory to write into. When omitted, ``DATA_DIR`` from
	            ``config`` is used, which is the original behaviour. Missing
	            parent directories are created.

	    Returns:
	        None. Progress is printed to stdout.
	    """
	    target_dir = DATA_DIR if data_dir is None else Path(data_dir)

	    # Sample documents (you can replace with your own dataset)
	    documents = _sample_documents()

	    # Additional generated text files, one per topic
	    topics = ["ai", "databases", "web", "devops", "cloud", "security"]
	    documents.extend(
	        (f"{topic}_overview.txt", _topic_overview(topic)) for topic in topics
	    )

	    print(f"Creating sample documents in {target_dir}...")
	    # parents=True so a fresh checkout without the parent folder still works.
	    target_dir.mkdir(parents=True, exist_ok=True)

	    for name, content in documents:
	        file_path = target_dir / name
	        file_path.write_text(content, encoding="utf-8")
	        print(f" Created: {file_path}")

	    print(f"\nCreated {len(documents)} sample documents in {target_dir}")
	    print("You can add your own documents to the data/ directory")

	# Generate the sample files only when run as a script, not on import.
	if __name__ == "__main__":
	    download_sample_data()