|
|
|
|
|
""" |
|
|
Generate sample documents locally for testing (no network download is performed).
|
|
""" |
|
|
import requests |
|
|
import zipfile |
|
|
from pathlib import Path |
|
|
import sys |
|
|
import os |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
|
|
|
from config import DATA_DIR |
|
|
|
|
|
# Hand-written sample documents as (file name, file content) pairs.
_SAMPLE_DOCS = (
    (
        "machine_learning_intro.md",
        """# Machine Learning Introduction
Machine learning is a subset of artificial intelligence that enables systems
to learn and improve from experience without being explicitly programmed.

## Types of Machine Learning
1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning

## Applications
- Natural Language Processing
- Computer Vision
- Recommendation Systems
- Predictive Analytics""",
    ),
    (
        "fastapi_guide.md",
        # The fenced code blocks below were previously garbled ("`ash", a bare
        # "python" line, a stray trailing backtick); they are restored so the
        # sample is valid Markdown.
        """# FastAPI Guide
FastAPI is a modern, fast web framework for building APIs with Python 3.7+.

## Key Features
- Fast: Very high performance
- Easy: Easy to use and learn
- Standards-based: Based on OpenAPI and JSON Schema

## Installation
```bash
pip install fastapi uvicorn
```

## Basic Example
```python
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def read_root():
    return {"Hello": "World"}
```""",
    ),
    (
        "python_basics.txt",
        """Python Programming Basics

Python is an interpreted, high-level programming language known for its readability.
Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.

Data Types:
- Integers, Floats
- Strings
- Lists, Tuples
- Dictionaries
- Sets

Control Structures:
- if/else statements
- for loops
- while loops
- try/except blocks""",
    ),
    (
        "database_concepts.md",
        """# Database Concepts

## SQL vs NoSQL
SQL databases are relational, NoSQL databases are non-relational.

## Common Databases
1. PostgreSQL
2. MySQL
3. MongoDB
4. Redis

## Indexing
Indexes improve query performance but slow down write operations.
Common index types: B-tree, Hash, Bitmap.""",
    ),
    (
        "web_development.txt",
        """Web Development Overview

Frontend: HTML, CSS, JavaScript
Backend: Python, Node.js, Java, Go
Databases: SQL, NoSQL
DevOps: Docker, Kubernetes, CI/CD

Frameworks:
- React, Vue, Angular (Frontend)
- Django, Flask, FastAPI (Python)
- Express.js (Node.js)
- Spring Boot (Java)""",
    ),
)

# Topics for which a templated "<topic>_overview.txt" file is generated.
_TOPICS = ("ai", "databases", "web", "devops", "cloud", "security")


def _build_topic_overview(topic):
    """Return the templated overview text for a single *topic*."""
    parts = [
        f"# {topic.title()} Overview\n\n",
        f"This document discusses key concepts in {topic}.\n\n",
        "## Key Concepts\n",
    ]
    for j in range(1, 6):
        parts.append(f"{j}. Important aspect {j} of {topic}\n")
        parts.extend(
            f" - Detail {j}{letter} about this aspect\n" for letter in "abc"
        )
        # Blank line separating one numbered aspect from the next.
        parts.append("\n")
    parts.append("## Applications\n")
    parts.extend(f"- Application {k} of {topic}\n" for k in range(1, 4))
    return "".join(parts)


def download_sample_data(data_dir=None):
    """Create a small sample dataset of documents on disk.

    Despite the name, nothing is downloaded: five hand-written documents and
    one generated overview per entry in ``_TOPICS`` are written as UTF-8 text
    files. Existing files with the same names are overwritten.

    Args:
        data_dir: Directory to write into (``str`` or ``Path``). Defaults to
            the module-level ``DATA_DIR``. The directory and any missing
            parents are created if necessary.

    Returns:
        None. Progress is reported on standard output.
    """
    target = DATA_DIR if data_dir is None else Path(data_dir)

    print(f"Creating sample documents in {target}...")
    # parents=True so a missing parent directory does not abort the run.
    target.mkdir(parents=True, exist_ok=True)

    for name, content in _SAMPLE_DOCS:
        file_path = target / name
        file_path.write_text(content, encoding="utf-8")
        print(f" Created: {file_path}")

    for topic in _TOPICS:
        file_path = target / f"{topic}_overview.txt"
        file_path.write_text(_build_topic_overview(topic), encoding="utf-8")
        print(f" Created: {file_path}")

    print(f"\nCreated {len(_SAMPLE_DOCS) + len(_TOPICS)} sample documents in {target}")
    print("You can add your own documents to the data/ directory")
|
|
|
|
|
# Generate the sample documents only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    download_sample_data()
|
|
|