File size: 4,229 Bytes
04ab625 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
#!/usr/bin/env python3
"""
Download sample documents for testing.
"""
import requests
import zipfile
from pathlib import Path
import sys
import os
# Add the parent directory to Python path so we can import config
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import DATA_DIR
def _build_topic_overview(topic):
    """Return the generated overview text for a single *topic*.

    The text has a title line, a one-sentence introduction, five numbered
    "key concepts" (each with details a, b and c) and three applications.
    """
    parts = [
        f"# {topic.title()} Overview\n\n",
        f"This document discusses key concepts in {topic}.\n\n",
        "## Key Concepts\n",
    ]
    for j in range(1, 6):
        parts.append(f"{j}. Important aspect {j} of {topic}\n")
        parts.extend(
            f" - Detail {j}{suffix} about this aspect\n" for suffix in "abc"
        )
        # Blank line separating one numbered aspect from the next.
        parts.append("\n")
    parts.append("## Applications\n")
    parts.extend(f"- Application {k} of {topic}\n" for k in range(1, 4))
    return "".join(parts)


def download_sample_data():
    """Write a small set of sample documents into ``DATA_DIR``.

    Despite the name, nothing is fetched over the network: five hand-written
    documents plus one generated ``<topic>_overview.txt`` file per topic are
    written to ``DATA_DIR`` as UTF-8 text. Existing files with the same names
    are overwritten. Progress is reported on stdout.

    Returns:
        None.
    """
    # Sample documents (you can replace with your own dataset)
    sample_docs = [
        {
            "name": "machine_learning_intro.md",
            "content": """# Machine Learning Introduction
Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.
## Types of Machine Learning
1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning
## Applications
- Natural Language Processing
- Computer Vision
- Recommendation Systems
- Predictive Analytics""",
        },
        {
            "name": "fastapi_guide.md",
            # The fenced code blocks below use proper ``` fences so the
            # document renders as valid Markdown.
            "content": """# FastAPI Guide
FastAPI is a modern, fast web framework for building APIs with Python 3.7+.
## Key Features
- Fast: Very high performance
- Easy: Easy to use and learn
- Standards-based: Based on OpenAPI and JSON Schema
## Installation
```bash
pip install fastapi uvicorn
```
## Basic Example
```python
from fastapi import FastAPI
app = FastAPI()
@app.get("/")
def read_root():
    return {"Hello": "World"}
```""",
        },
        {
            "name": "python_basics.txt",
            "content": """Python Programming Basics
Python is an interpreted, high-level programming language known for its readability.
Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.
Data Types:
- Integers, Floats
- Strings
- Lists, Tuples
- Dictionaries
- Sets
Control Structures:
- if/else statements
- for loops
- while loops
- try/except blocks""",
        },
        {
            "name": "database_concepts.md",
            "content": """# Database Concepts
## SQL vs NoSQL
SQL databases are relational, NoSQL databases are non-relational.
## Common Databases
1. PostgreSQL
2. MySQL
3. MongoDB
4. Redis
## Indexing
Indexes improve query performance but slow down write operations.
Common index types: B-tree, Hash, Bitmap.""",
        },
        {
            "name": "web_development.txt",
            "content": """Web Development Overview
Frontend: HTML, CSS, JavaScript
Backend: Python, Node.js, Java, Go
Databases: SQL, NoSQL
DevOps: Docker, Kubernetes, CI/CD
Frameworks:
- React, Vue, Angular (Frontend)
- Django, Flask, FastAPI (Python)
- Express.js (Node.js)
- Spring Boot (Java)""",
        },
    ]
    topics = ["ai", "databases", "web", "devops", "cloud", "security"]

    print(f"Creating sample documents in {DATA_DIR}...")
    # parents=True so a missing intermediate directory does not abort the run.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    for doc in sample_docs:
        file_path = DATA_DIR / doc["name"]
        file_path.write_text(doc["content"], encoding="utf-8")
        print(f" Created: {file_path}")

    # Create additional text files
    for topic in topics:
        file_path = DATA_DIR / f"{topic}_overview.txt"
        file_path.write_text(_build_topic_overview(topic), encoding="utf-8")
        print(f" Created: {file_path}")

    print(f"\nCreated {len(sample_docs) + len(topics)} sample documents in {DATA_DIR}")
    print("You can add your own documents to the data/ directory")
# Generate the sample documents only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_sample_data()
|