rag-latency-optimization / scripts /download_sample_data.py
Ariyan-Pro's picture
Deploy RAG Latency Optimization v1.0
04ab625
#!/usr/bin/env python3
"""
Download sample documents for testing.
"""
import requests
import zipfile
from pathlib import Path
import sys
import os
# Add the parent directory to Python path so we can import config
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import DATA_DIR
# Topics for which a templated "<topic>_overview.txt" file is generated.
_OVERVIEW_TOPICS = ("ai", "databases", "web", "devops", "cloud", "security")


def _sample_documents():
    """Return the hand-written sample documents as name/content dicts."""
    return [
        {
            "name": "machine_learning_intro.md",
            "content": """# Machine Learning Introduction
Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
## Types of Machine Learning
1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning
## Applications
- Natural Language Processing
- Computer Vision
- Recommendation Systems
- Predictive Analytics""",
        },
        {
            "name": "fastapi_guide.md",
            # NOTE(review): the code fences in this document were garbled in
            # the original literal ("`ash", a bare "python" line); they are
            # reconstructed here as standard Markdown fenced code blocks.
            "content": """# FastAPI Guide
FastAPI is a modern, fast web framework for building APIs with Python 3.7+.
## Key Features
- Fast: Very high performance
- Easy: Easy to use and learn
- Standards-based: Based on OpenAPI and JSON Schema
## Installation
```bash
pip install fastapi uvicorn
```
## Basic Example
```python
from fastapi import FastAPI
app = FastAPI()
@app.get("/")
def read_root():
    return {"Hello": "World"}
```""",
        },
        {
            "name": "python_basics.txt",
            "content": """Python Programming Basics
Python is an interpreted, high-level programming language known for its readability.
Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.
Data Types:
- Integers, Floats
- Strings
- Lists, Tuples
- Dictionaries
- Sets
Control Structures:
- if/else statements
- for loops
- while loops
- try/except blocks""",
        },
        {
            "name": "database_concepts.md",
            "content": """# Database Concepts
## SQL vs NoSQL
SQL databases are relational, NoSQL databases are non-relational.
## Common Databases
1. PostgreSQL
2. MySQL
3. MongoDB
4. Redis
## Indexing
Indexes improve query performance but slow down write operations.
Common index types: B-tree, Hash, Bitmap.""",
        },
        {
            "name": "web_development.txt",
            "content": """Web Development Overview
Frontend: HTML, CSS, JavaScript
Backend: Python, Node.js, Java, Go
Databases: SQL, NoSQL
DevOps: Docker, Kubernetes, CI/CD
Frameworks:
- React, Vue, Angular (Frontend)
- Django, Flask, FastAPI (Python)
- Express.js (Node.js)
- Spring Boot (Java)""",
        },
    ]


def _build_topic_overview(topic) -> str:
    """Return the templated overview text for a single topic name."""
    parts = [
        f"# {topic.title()} Overview\n\n",
        f"This document discusses key concepts in {topic}.\n\n",
        "## Key Concepts\n",
    ]
    for j in range(1, 6):
        parts.append(f"{j}. Important aspect {j} of {topic}\n")
        parts.append(f" - Detail {j}a about this aspect\n")
        parts.append(f" - Detail {j}b about this aspect\n")
        parts.append(f" - Detail {j}c about this aspect\n\n")
    parts.append("## Applications\n")
    parts.extend(f"- Application {k} of {topic}\n" for k in range(1, 4))
    # Join once instead of growing a string with repeated "+=".
    return "".join(parts)


def download_sample_data(data_dir=None) -> None:
    """Write a small sample dataset of documents to disk.

    Nothing is fetched over the network: five hand-written documents plus one
    templated overview file per entry in ``_OVERVIEW_TOPICS`` are written as
    UTF-8 text. Existing files with the same names are overwritten.

    Args:
        data_dir: Directory to write into. Defaults to ``DATA_DIR`` from the
            config module when omitted or ``None``.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
    """
    target_dir = DATA_DIR if data_dir is None else Path(data_dir)

    # Sample documents (you can replace with your own dataset)
    sample_docs = _sample_documents()

    print(f"Creating sample documents in {target_dir}...")
    # parents=True: a fresh checkout may not have the parent directory yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    for doc in sample_docs:
        file_path = target_dir / doc["name"]
        file_path.write_text(doc["content"], encoding="utf-8")
        print(f" Created: {file_path}")

    # Create additional text files
    for topic in _OVERVIEW_TOPICS:
        file_path = target_dir / f"{topic}_overview.txt"
        file_path.write_text(_build_topic_overview(topic), encoding="utf-8")
        print(f" Created: {file_path}")

    total = len(sample_docs) + len(_OVERVIEW_TOPICS)
    print(f"\nCreated {total} sample documents in {target_dir}")
    print("You can add your own documents to the data/ directory")
# Script entry point: write the sample corpus only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    download_sample_data()