#!/usr/bin/env python3
"""
Create sample documents for testing.

Nothing is fetched over the network: every document is generated locally
and written into ``DATA_DIR`` (imported from the project's ``config`` module).
"""

import os
import sys
import zipfile
from pathlib import Path

import requests

# NOTE(review): `os`, `zipfile` and `requests` are not used by the code in this
# file; they are kept in case something else relies on them -- confirm and prune.

# Add the parent directory to Python path so we can import config
sys.path.insert(0, str(Path(__file__).parent.parent))

from config import DATA_DIR


def _build_overview(topic):
    """Return the generated overview text for a single *topic*."""
    parts = [
        f"# {topic.title()} Overview\n\n",
        f"This document discusses key concepts in {topic}.\n\n",
        "## Key Concepts\n",
    ]
    for j in range(1, 6):
        parts.append(f"{j}. Important aspect {j} of {topic}\n")
        parts.append(f" - Detail {j}a about this aspect\n")
        parts.append(f" - Detail {j}b about this aspect\n")
        parts.append(f" - Detail {j}c about this aspect\n\n")
    parts.append("## Applications\n")
    parts.extend(f"- Application {k} of {topic}\n" for k in range(1, 4))
    # Join once instead of growing a string with repeated `+=`.
    return "".join(parts)


def _write_document(file_path, content):
    """Write *content* to *file_path* as UTF-8 and report the created file."""
    file_path.write_text(content, encoding='utf-8')
    print(f" Created: {file_path}")


def download_sample_data():
    """Create a small sample dataset of documents in ``DATA_DIR``.

    Two groups of files are written (existing files with the same names are
    overwritten):

    * five hand-written documents (Markdown and plain text), and
    * one generated ``<topic>_overview.txt`` file per entry in ``topics``.

    Despite the name, no network access takes place. ``DATA_DIR`` is created
    first, including any missing parent directories. Progress is printed to
    standard output. Returns ``None``.
    """
    # Sample documents (you can replace with your own dataset)
    sample_docs = [
        {
            "name": "machine_learning_intro.md",
            "content": """# Machine Learning Introduction

Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed.

## Types of Machine Learning
1. Supervised Learning
2. Unsupervised Learning
3. Reinforcement Learning

## Applications
- Natural Language Processing
- Computer Vision
- Recommendation Systems
- Predictive Analytics"""
        },
        {
            "name": "fastapi_guide.md",
            # The code samples use proper fenced blocks so the generated
            # Markdown renders (and chunks) as code rather than as prose.
            "content": """# FastAPI Guide

FastAPI is a modern, fast web framework for building APIs with Python 3.7+.

## Key Features
- Fast: Very high performance
- Easy: Easy to use and learn
- Standards-based: Based on OpenAPI and JSON Schema

## Installation
```bash
pip install fastapi uvicorn
```

## Basic Example
```python
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def read_root():
    return {"Hello": "World"}
```"""
        },
        {
            "name": "python_basics.txt",
            "content": """Python Programming Basics

Python is an interpreted, high-level programming language known for its readability.

Key features include dynamic typing, automatic memory management, and support for multiple programming paradigms.

Data Types:
- Integers, Floats
- Strings
- Lists, Tuples
- Dictionaries
- Sets

Control Structures:
- if/else statements
- for loops
- while loops
- try/except blocks"""
        },
        {
            "name": "database_concepts.md",
            "content": """# Database Concepts

## SQL vs NoSQL
SQL databases are relational, NoSQL databases are non-relational.

## Common Databases
1. PostgreSQL
2. MySQL
3. MongoDB
4. Redis

## Indexing
Indexes improve query performance but slow down write operations.
Common index types: B-tree, Hash, Bitmap."""
        },
        {
            "name": "web_development.txt",
            "content": """Web Development Overview

Frontend: HTML, CSS, JavaScript
Backend: Python, Node.js, Java, Go
Databases: SQL, NoSQL
DevOps: Docker, Kubernetes, CI/CD

Frameworks:
- React, Vue, Angular (Frontend)
- Django, Flask, FastAPI (Python)
- Express.js (Node.js)
- Spring Boot (Java)"""
        }
    ]

    print(f"Creating sample documents in {DATA_DIR}...")
    # parents=True: do not fail when DATA_DIR's parent directory is missing.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    for doc in sample_docs:
        _write_document(DATA_DIR / doc["name"], doc["content"])

    # Create additional text files
    topics = ["ai", "databases", "web", "devops", "cloud", "security"]
    for topic in topics:
        _write_document(DATA_DIR / f"{topic}_overview.txt", _build_overview(topic))

    print(f"\nCreated {len(sample_docs) + len(topics)} sample documents in {DATA_DIR}")
    print("You can add your own documents to the data/ directory")


if __name__ == "__main__":
    download_sample_data()