diff --git a/.env.example b/.env.example deleted file mode 100644 index 55f677de2c23e27d86c658fdf2953bd5c7348632..0000000000000000000000000000000000000000 --- a/.env.example +++ /dev/null @@ -1,99 +0,0 @@ -# ======================================== -# YouTube Study Notes AI - Environment Configuration -# ======================================== - -# ---------------------------------------- -# Google Gemini API -# ---------------------------------------- -# Get your API key from: https://makersuite.google.com/app/apikey -GOOGLE_API_KEY=AIzaSyD_wHludyQbdlNk9rTvitwV1lWkJPqoYbE - -# ---------------------------------------- -# Database Configuration (Supabase PostgreSQL) -# ---------------------------------------- -# Format: postgresql+asyncpg://[user]:[password]@[host]:[port]/[database] -# -# For Supabase: -# 1. Go to your Supabase project dashboard -# 2. Navigate to Settings > Database -# 3. Under "Connection string", select "URI" mode -# 4. Copy the connection string and replace "postgresql://" with "postgresql+asyncpg://" -# -# Example: -# DATABASE_URL=postgresql+asyncpg://postgres.[your-project-ref]:[your-password]@aws-0-us-east-1.pooler.supabase.com:5432/postgres -#6SsbAvgkhJTEl2CS -# postgresql://postgres:[YOUR-PASSWORD]@db.jqjjanimclxplgvoiwyk.supabase.co:5432/postgres -# DATABASE_URL="postgresql+asyncpg://postgres.jqjjanimclxplgvoiwyk:AliProject2026@aws-1-eu-west-1.pooler.supabase.com:6543/postgres?ssl=require" - -DATABASE_URL="postgresql+asyncpg://postgres.mupnxzgkcdmwhojmhxdh:6SsbAvgkhJTEl2CS@aws-1-eu-west-1.pooler.supabase.com:6543/postgres?ssl=require" - -# ---------------------------------------- -# Authentication & Security -# ---------------------------------------- -# Generate a secure secret key with: python -c "import secrets; print(secrets.token_urlsafe(32))" -SECRET_KEY=5j55fk-ljD_Urkih0dvZ11WQ0iTYwwjPyQ7N7Un0eDY -ACCESS_TOKEN_EXPIRE_MINUTES=60 -ALGORITHM=HS256 - -# ---------------------------------------- -# API Server Configuration -# 
---------------------------------------- - -# 192.168.1.101 -API_HOST=0.0.0.0 -API_PORT=8000 - -# ---------------------------------------- -# Whisper Model Configuration -# ---------------------------------------- -# Options: tiny, base, small, medium, large -# Larger models are more accurate but slower -WHISPER_MODEL_SIZE=base - -# ---------------------------------------- -# Processing Limits -# ---------------------------------------- -# Maximum video duration in seconds (2 hours = 7200) -MAX_VIDEO_DURATION=7200 - -# ---------------------------------------- -# Output Configuration -# ---------------------------------------- -OUTPUT_FORMAT=markdown -OUTPUT_DIR=outputs - -# ---------------------------------------- -# Logging -# ---------------------------------------- -LOG_LEVEL=INFO -LOG_FILE=app.log - -# ---------------------------------------- -# IMPORTANT NOTES FOR SUPABASE SETUP -# ---------------------------------------- -# -# After creating your Supabase project: -# -# 1. DATABASE_URL Setup: -# - Go to Project Settings > Database -# - Find "Connection string" section -# - Select "URI" mode -# - Copy the connection string -# - Replace "postgresql://" with "postgresql+asyncpg://" -# - Replace [YOUR-PASSWORD] with your actual database password -# -# 2. Security: -# - NEVER commit this .env file to version control -# - Add .env to your .gitignore file -# - Generate a strong SECRET_KEY for production -# -# 3. Connection Pooling: -# - Supabase provides connection pooling by default -# - Use the "pooler" connection string for better performance -# - Example: aws-0-us-east-1.pooler.supabase.com -# -# 4. 
SSL Mode: -# - Supabase requires SSL connections (enabled by default with asyncpg) -# - If you encounter SSL errors, you can disable verification (NOT RECOMMENDED for production): -# DATABASE_URL=postgresql+asyncpg://...?ssl=require -# diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index a6344aac8c09253b3b630fb776ae94478aa0275b..0000000000000000000000000000000000000000 --- a/.gitattributes +++ /dev/null @@ -1,35 +0,0 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs 
-text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..154ff6235d764eff1b90e406bbb13f97a2fc7686 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Ignore environment file with secrets +.env +firebase-service-account.json +serviceAccountKey.json + +# Ignore Python cache +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python + +# Ignore virtual environments +venv/ +env/ +ENV/ + +# Ignore temporary files +temp/* +!temp/.gitkeep + +# Ignore output files +outputs/* +!outputs/.gitkeep + +# Ignore logs +*.log + +# Ignore OS files +.DS_Store +Thumbs.db + +# Ignore IDE files +.vscode/ +.idea/ +*.swp +*.swo + +# Ignore downloaded models (Whisper caches) +~/.cache/whisper/ diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..6324d401a069f4020efcf0ff07442724b52f47c2 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.14 diff --git a/Dockerfile b/Dockerfile index 2774dd8e30526bc18f9b3b7608b4ba3e8a1b0aa4..55b27fb11c697968084a2a7288e6783b05b4b353 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y \ WORKDIR /app # Install Python dependencies +# We copy requirements first to leverage Docker cache COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt @@ -24,5 +25,5 @@ COPY . . 
"""List the Gemini models the configured API key can use for text generation.

Run as a script: ``python check_models.py``. The key is read from the
``GOOGLE_API_KEY`` environment variable after loading a local ``.env`` file.
"""
import os
import sys


def text_generation_models(models):
    """Return the names of the models that support ``generateContent``.

    Args:
        models: Iterable of objects exposing ``name`` and
            ``supported_generation_methods`` (the two attributes this script
            reads from the items yielded by ``genai.list_models()``).

    Returns:
        A list of model names, in the order the models were yielded.
    """
    return [
        model.name
        for model in models
        if "generateContent" in model.supported_generation_methods
    ]


def wait_for_enter(prompt="\nPress Enter to exit..."):
    """Keep the console window open until the user presses Enter.

    Does nothing when stdin is missing or is not a terminal (container, CI,
    piped input), where a bare ``input()`` would block or raise ``EOFError``.
    """
    if sys.stdin is None or not sys.stdin.isatty():
        return
    try:
        input(prompt)
    except EOFError:
        # The terminal was closed while waiting; there is nothing left to do.
        pass


def main():
    """Print every model usable for text generation.

    Returns:
        ``0`` when at least one model is available, ``1`` otherwise (missing
        dependency, missing key, API failure, or no matching model).
    """
    try:
        # Imported here so that a missing package produces a readable hint
        # from this diagnostic script instead of a traceback.
        import google.generativeai as genai
        from dotenv import load_dotenv
    except ImportError as exc:
        print(f"❌ Missing dependency: {exc}")
        return 1

    load_dotenv()
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        print("❌ Can't find GOOGLE_API_KEY in the environment or the .env file")
        return 1

    genai.configure(api_key=api_key)
    print("🔍 Searching for available models...")
    try:
        names = text_generation_models(genai.list_models())
    except Exception as e:  # top-level boundary: report any SDK/network error
        print(f"❌ An error occurred: {e}")
        return 1

    for name in names:
        print(f"✅ Available: {name}")
    if not names:
        print("⚠️ No models found that support text generation.")
        return 1
    return 0


if __name__ == "__main__":
    exit_code = main()
    wait_for_enter()
    sys.exit(exit_code)
[https://youtube.com/watch?v=example](https://youtube.com/watch?v=example) +**Duration:** 18:45 +**Generated:** AI Study Notes + +--- + +# Introduction to Neural Networks + +## What is a Neural Network? + +- **Artificial Neural Network (ANN)**: A computing system inspired by biological neural networks +- Composed of interconnected nodes (neurons) organized in layers +- Learns to perform tasks by considering examples without task-specific programming +- **Key components**: + - Input layer: Receives initial data + - Hidden layers: Process information + - Output layer: Produces final results + +## Basic Architecture + +- **Neurons**: Basic computational units that receive input and produce output +- **Weights**: Parameters that determine the strength of connections +- **Bias**: Additional parameter to adjust the output +- **Activation Function**: Introduces non-linearity to the network + - Common functions: ReLU, Sigmoid, Tanh + +## How Neural Networks Learn + +- **Training Process**: Iterative adjustment of weights and biases +- **Forward Propagation**: + - Input data flows through the network + - Each neuron applies weights and an activation function + - Output is computed at the end + +- **Backpropagation**: + - Compare output with expected result + - Calculate error/loss + - Propagate error backwards through network + - Update weights using gradient descent + +## Training Components + +- **Loss Function**: Measures how far predictions are from actual values + - Mean Squared Error (MSE) for regression + - Cross-Entropy for classification + +- **Optimizer**: Algorithm to update weights + - **Gradient Descent**: Basic optimization method + - **Adam**: Adaptive learning rate optimizer (popular choice) + - **SGD**: Stochastic Gradient Descent + +## Common Applications + +- **Image Recognition**: Identifying objects in photos +- **Natural Language Processing**: Understanding and generating text +- **Speech Recognition**: Converting audio to text +- **Game Playing**: 
Learning to play games through reinforcement +- **Recommendation Systems**: Suggesting content based on preferences + +## Deep Learning + +- **Definition**: Neural networks with multiple hidden layers (deep networks) +- More layers enable learning of hierarchical features +- **Examples**: + - Convolutional Neural Networks (CNN): For image processing + - Recurrent Neural Networks (RNN): For sequential data + - Transformers: For language understanding + +## Key Concepts + +- **Overfitting**: Model learns training data too well, poor generalization + - Solution: Regularization, dropout, more data + +- **Underfitting**: Model too simple to capture patterns + - Solution: More complex model, more features + +- **Hyperparameters**: Settings configured before training + - Learning rate + - Number of layers + - Number of neurons per layer + - Batch size + +## Training Best Practices + +- Start with a simple architecture +- Use appropriate activation functions +- Normalize input data +- Split data into training, validation, and test sets +- Monitor training and validation loss +- Use regularization techniques to prevent overfitting +- Experiment with different optimizers and learning rates + +## Challenges + +- Requires large amounts of data +- Computationally expensive (GPU recommended) +- Can be difficult to interpret ("black box") +- Choosing right architecture requires experience +- Risk of overfitting with complex models + +## Summary + +Neural networks are powerful machine learning models that can learn complex patterns from data. They consist of layers of interconnected neurons that process information through weighted connections. Through the process of forward propagation and backpropagation, the network learns to minimize errors and make accurate predictions. While they require significant computational resources and data, they have revolutionized fields like computer vision, natural language processing, and many other domains of artificial intelligence. 
"""Download a Windows FFmpeg build and unpack ffmpeg.exe and ffprobe.exe.

Run as a script: ``python get_ffmpeg.py``. The archive is downloaded to
``zip_filename`` in the current working directory, the two executables are
extracted next to it, and the archive is removed afterwards.
"""
import os
import shutil
import sys
import urllib.request
import zipfile

# Direct download URL
url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
zip_filename = "ffmpeg_temp.zip"

# Archive members to extract. They are matched by path suffix, so the name of
# the folder that contains ``bin/`` inside the archive does not matter.
WANTED_SUFFIXES = ("bin/ffmpeg.exe", "bin/ffprobe.exe")
REQUIRED_BINARIES = frozenset(os.path.basename(s) for s in WANTED_SUFFIXES)


def format_progress(block_num, block_size, total_size):
    """Build the one-line progress text for a download in flight.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size in bytes; zero or negative means unknown.

    Returns:
        The text to write (starting with a carriage return so it overwrites
        the previous progress line), or ``None`` when the total is unknown.
    """
    if total_size <= 0:
        return None
    # The final block is usually partial, so blocks * size overshoots the real
    # total; clamp it so the display never exceeds 100%.
    downloaded = min(block_num * block_size, total_size)
    percent = downloaded * 100 / total_size
    megabytes = downloaded / (1024 * 1024)
    return f"\r⏳ Downloading: {percent:.1f}% ({megabytes:.1f} MB)"


def progress_bar(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve``: print download progress."""
    line = format_progress(block_num, block_size, total_size)
    if line is not None:
        sys.stdout.write(line)
        sys.stdout.flush()


def extract_binaries(zip_path, dest_dir="."):
    """Extract the wanted executables from *zip_path* into *dest_dir*.

    Members are written flat (base name only) and streamed to disk rather
    than being read into memory in one piece.

    Args:
        zip_path: Path of the downloaded archive.
        dest_dir: Existing directory that receives the files. Defaults to the
            current working directory.

    Returns:
        The list of file names written, in archive order.

    Raises:
        zipfile.BadZipFile: If *zip_path* is not a valid zip archive.
        OSError: If the archive cannot be read or a target cannot be written.
    """
    extracted = []
    with zipfile.ZipFile(zip_path, "r") as archive:
        for member in archive.namelist():
            if not member.endswith(WANTED_SUFFIXES):
                continue
            target_name = os.path.basename(member)
            print(f" Extracting -> {target_name}")
            target_path = os.path.join(dest_dir, target_name)
            with archive.open(member) as src, open(target_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
            extracted.append(target_name)
    return extracted


def remove_temp_archive(path):
    """Delete the downloaded archive, warning instead of failing on error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # The download never created the file; nothing to clean up.
        pass
    except OSError as e:
        print(f"\n⚠️ Warning: Could not remove {path}: {e}")


def wait_for_enter(prompt="\nPress Enter to exit..."):
    """Keep the console window open until the user presses Enter.

    Does nothing when stdin is missing or is not a terminal, where a bare
    ``input()`` would block or raise ``EOFError``.
    """
    if sys.stdin is None or not sys.stdin.isatty():
        return
    try:
        input(prompt)
    except EOFError:
        # The terminal was closed while waiting; there is nothing left to do.
        pass


def main():
    """Download, extract and clean up.

    Returns:
        ``0`` when both executables were extracted, ``1`` otherwise.
    """
    print("🚀 Starting FFmpeg download (approx 130MB)...")

    try:
        # 1. Download with progress bar
        urllib.request.urlretrieve(url, zip_filename, progress_bar)
        print("\n\n📦 Download complete! Extracting files...")

        # 2. Extract specific files
        extracted = extract_binaries(zip_filename)
    except Exception as e:  # top-level boundary: report network/zip/disk errors
        print(f"\n❌ Error occurred: {e}")
        return 1
    finally:
        # 3. Cleanup - also on failure, so a partial download is not left behind
        remove_temp_archive(zip_filename)

    if REQUIRED_BINARIES <= set(extracted):
        print("\n✅ Success! FFmpeg installed successfully.")
        print("You can now run: python run.py server")
        return 0

    print("\n⚠️ Warning: Could not find ffmpeg files in the zip.")
    return 1


if __name__ == "__main__":
    exit_code = main()
    wait_for_enter()
    sys.exit(exit_code)
b/requirements.txt @@ -26,3 +26,4 @@ bcrypt==4.1.2 google-api-python-client==2.115.0 pydantic-core==2.41.5 ffmpeg-python +firebase-admin==6.5.0 diff --git a/run.py b/run.py index d2cebf0832b4e48817bf8af86970edafc4e24ca8..b79663614583ab73905bfc0d6a2cc1dea7435501 100644 --- a/run.py +++ b/run.py @@ -3,11 +3,11 @@ Main entry point for YouTube Study Notes AI. Provides CLI interface and server startup. """ +import os import sys import argparse from pathlib import Path -# Import necessary modules for server and middleware from src.utils.logger import setup_logger from src.utils.config import settings diff --git a/src/__pycache__/__init__.cpython-312.pyc b/src/__pycache__/__init__.cpython-312.pyc index c7c965cc900924d59cb7d583f10bf25a47bcba84..2676e3aff80264c22cca1dc74074572c639c60ad 100644 Binary files a/src/__pycache__/__init__.cpython-312.pyc and b/src/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/__pycache__/__init__.cpython-314.pyc b/src/__pycache__/__init__.cpython-314.pyc index eea5e8ed105a5ead32a4472549b728407bf8f645..8eefc491dc1ac87c18486979f1562f8779eba5ff 100644 Binary files a/src/__pycache__/__init__.cpython-314.pyc and b/src/__pycache__/__init__.cpython-314.pyc differ diff --git a/src/ai_modules/README.md b/src/ai_modules/README.md deleted file mode 100644 index 2dcd78c0bb4c4f41165fd7c191eb5d07c4e0a93c..0000000000000000000000000000000000000000 --- a/src/ai_modules/README.md +++ /dev/null @@ -1,137 +0,0 @@ -# AI Modules Overview πŸ€– - -## Overview of AI Modules - -This directory contains all AI-related modules in the project, divided into 4 main components: - -## The Four Modules - -### 1. 🎀 Transcription -**Responsibility:** Convert YouTube videos to written text. - -**Files:** -- `audio_downloader.py` - Download audio from YouTube -- `whisper_transcriber.py` - Convert audio to text using Whisper -- `audio_processor.py` - Process and validate audio file quality - -**Technologies:** OpenAI Whisper, yt-dlp, PyTorch - ---- - -### 2. 
📝 Summarization -**Responsibility:** Convert text to organized study notes. - -**Files:** -- `note_generator.py` - Generate notes using Gemini -- `schemas.py` - Define data structure -- `segmenter.py` - Split long texts - -**Technologies:** Google Gemini, Pydantic - ---- - -### 3. 🎯 Recommendation -**Responsibility:** Suggest new educational videos to users. - -**Files:** -- `recommender.py` - Search YouTube and analyze interests - -**Technologies:** YouTube Data API v3 - ---- - -### 4. 🏷️ Categorization -**Responsibility:** Automatically categorize notes. - -**Files:** -- `categorizer.py` - Categorize text using AI - -**Technologies:** Google Gemini - ---- - -## Complete Workflow (Full Pipeline) - -```mermaid -graph LR - A[YouTube URL] --> B[Transcription] - B --> C[Summarization] - C --> D[Categorization] - D --> E[Database] - E --> F[Recommendation] - F --> G[New Videos] -``` - -1. **User enters YouTube URL** → Transcription module -2. **Audio is downloaded and converted to text** → Summarization module -3. **Text is summarized and organized** → Categorization module -4. **Summary is categorized** → Database -5. **Based on saved notes** → Recommendation module -6. **New videos are suggested** → User - ---- - -## For Team Members: How to Start? - -### If you're responsible for Transcription: -1. Open `transcription/` directory -2. Read the `README.md` file inside -3. Test the code using the provided examples -4. Develop the proposed features - -### If you're responsible for Summarization: -1. Open `summarization/` directory -2. Read the `README.md` file -3. Try modifying the prompts to improve summary quality -4. Add new features (like translation) - -### If you're responsible for Recommendation: -1. Open `recommendation/` directory -2. Read the `README.md` file -3. Develop a caching mechanism to save API quota -4. Improve recommendation accuracy - -### If you're responsible for Categorization: -1. Open `categorization/` directory -2. 
Read the `README.md` file -3. Add a predefined list of categories -4. Improve the prompt to increase accuracy - ---- - -## General Notes - -### Shared Libraries -All modules use: -- `src.utils.logger` - For logging -- `src.utils.config` - For reading settings from `.env` - -### Testing -To test any module, use: -```bash -cd D:\faculty\Class4\grad\program -python -m pytest tests/ -``` - -### Required Environment -Make sure to install the libraries: -```bash -pip install -r requirements.txt -``` - -### Required Environment Variables -In `.env` file: -``` -GOOGLE_API_KEY=your_google_api_key_here -WHISPER_MODEL_SIZE=base -DATABASE_URL=your_database_url -``` - ---- - -## Team Communication -- If you encounter a problem in a specific module, **open an Issue** on GitHub. -- If you add a new feature, **update the README** file for that module. -- Before committing, make sure the code runs without errors. - -**Good luck to the team! πŸš€** diff --git a/src/ai_modules/__pycache__/__init__.cpython-312.pyc b/src/ai_modules/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 2676e3aff80264c22cca1dc74074572c639c60ad..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/__pycache__/__init__.cpython-314.pyc b/src/ai_modules/__pycache__/__init__.cpython-314.pyc deleted file mode 100644 index 8eefc491dc1ac87c18486979f1562f8779eba5ff..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/__pycache__/__init__.cpython-314.pyc and /dev/null differ diff --git a/src/ai_modules/categorization/__pycache__/__init__.cpython-312.pyc b/src/ai_modules/categorization/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index cb82e91ad96961c6d9e3676289dc0382cfa03b32..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/categorization/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git 
a/src/ai_modules/categorization/__pycache__/categorizer.cpython-312.pyc b/src/ai_modules/categorization/__pycache__/categorizer.cpython-312.pyc deleted file mode 100644 index 582dce292776c1e54d0556c0b1d8b63e052040fc..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/categorization/__pycache__/categorizer.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/summarization/__pycache__/__init__.cpython-312.pyc b/src/ai_modules/summarization/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 6c256c13f30afcabb49128663d3484af91b894fb..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/summarization/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/summarization/__pycache__/__init__.cpython-314.pyc b/src/ai_modules/summarization/__pycache__/__init__.cpython-314.pyc deleted file mode 100644 index c2226d7ed96bdabbd1b47f659734cd3ea8345d93..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/summarization/__pycache__/__init__.cpython-314.pyc and /dev/null differ diff --git a/src/ai_modules/summarization/__pycache__/note_generator.cpython-312.pyc b/src/ai_modules/summarization/__pycache__/note_generator.cpython-312.pyc deleted file mode 100644 index 46f3be0175cf4b3788cdcf39f1b53231acbd2406..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/summarization/__pycache__/note_generator.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/summarization/__pycache__/note_generator.cpython-314.pyc b/src/ai_modules/summarization/__pycache__/note_generator.cpython-314.pyc deleted file mode 100644 index 5b833c7f35d02e893f48bca3190a392d949da8ac..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/summarization/__pycache__/note_generator.cpython-314.pyc and /dev/null differ diff --git a/src/ai_modules/summarization/__pycache__/schemas.cpython-312.pyc b/src/ai_modules/summarization/__pycache__/schemas.cpython-312.pyc 
deleted file mode 100644 index b97d1c5402d1e0d655e0595249cd1943073627b4..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/summarization/__pycache__/schemas.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/transcription/__init__.py b/src/ai_modules/transcription/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/src/ai_modules/transcription/__pycache__/__init__.cpython-312.pyc b/src/ai_modules/transcription/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 5b5816e7e925f1549065106c14e98a2bd4bb2de8..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/transcription/__pycache__/__init__.cpython-314.pyc b/src/ai_modules/transcription/__pycache__/__init__.cpython-314.pyc deleted file mode 100644 index 0d3e8d5e19be8faddfde3d03269e5a71e0d328aa..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/__init__.cpython-314.pyc and /dev/null differ diff --git a/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-312.pyc b/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-312.pyc deleted file mode 100644 index 2d11c141d70e7a25d9e1e296c8b12d84b47c39cc..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-314.pyc b/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-314.pyc deleted file mode 100644 index 975cf63ceeb1d36c50d623666eec08c391885e20..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/audio_downloader.cpython-314.pyc and /dev/null differ diff --git 
a/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-312.pyc b/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-312.pyc deleted file mode 100644 index 938062b2a5f6ed4e81628397c376fdffd20504d4..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-312.pyc and /dev/null differ diff --git a/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-314.pyc b/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-314.pyc deleted file mode 100644 index b9fdf14f1dde47ada3d0a2275fbc909f6d27ca8e..0000000000000000000000000000000000000000 Binary files a/src/ai_modules/transcription/__pycache__/whisper_transcriber.cpython-314.pyc and /dev/null differ diff --git a/src/api/__pycache__/main.cpython-312.pyc b/src/api/__pycache__/main.cpython-312.pyc index f204e5543aac6f42564eba78fafddf2acc85572f..afaf4ecdb21bd8ccc56e1792088b58c02d7a6fce 100644 Binary files a/src/api/__pycache__/main.cpython-312.pyc and b/src/api/__pycache__/main.cpython-312.pyc differ diff --git a/src/api/__pycache__/main.cpython-314.pyc b/src/api/__pycache__/main.cpython-314.pyc index cfdcb130b40e14c0634a3c38fd0aabb38c5db55e..5ab8f1be92fdfa942f9edceca059d0abf3af5bd9 100644 Binary files a/src/api/__pycache__/main.cpython-314.pyc and b/src/api/__pycache__/main.cpython-314.pyc differ diff --git a/src/api/auth_routes.py b/src/api/auth_routes.py index 7d4a788c5b66707e652aea318f959dbcf78feed7..dbf7adab26c010c49ee2a9baaee4bfe494a510f4 100644 --- a/src/api/auth_routes.py +++ b/src/api/auth_routes.py @@ -10,7 +10,7 @@ from pydantic import BaseModel, EmailStr, Field from sqlmodel import select from sqlmodel.ext.asyncio.session import AsyncSession -from src.db.database import get_session +from src.db.firebase import get_firebase_db from src.db.models import User from src.auth.security import hash_password, verify_password, create_access_token from src.utils.logger import setup_logger @@ -44,7 
+44,7 @@ class SignupRequest(BaseModel): class UserResponse(BaseModel): """Response model for user data (without password).""" - id: int + id: str email: str username: str role: str @@ -55,7 +55,7 @@ class UserResponse(BaseModel): class Config: json_schema_extra = { "example": { - "id": 1, + "id": "abc-123", "email": "student@example.com", "username": "Student123", "role": "user", @@ -85,19 +85,22 @@ class TokenResponse(BaseModel): "/signup", response_model=UserResponse, status_code=status.HTTP_201_CREATED ) async def signup( - signup_data: SignupRequest, session: AsyncSession = Depends(get_session) + signup_data: SignupRequest ): """ - Register a new user. + Register a new user using Firestore. """ - # Check if email or username already exists - statement = select(User).where( - (User.email == signup_data.email) | (User.username == signup_data.username) - ) - result = await session.exec(statement) - existing_user = result.first() + db = get_firebase_db() + if db is None: + raise HTTPException(status_code=500, detail="Firebase not configured") - if existing_user: + # Check if email or username already exists + users_ref = db.collection("users") + + email_check = users_ref.where("email", "==", signup_data.email).limit(1).stream() + username_check = users_ref.where("username", "==", signup_data.username).limit(1).stream() + + if next(email_check, None) or next(username_check, None): raise HTTPException( status_code=status.HTTP_409_CONFLICT, detail="Email or Username already registered", @@ -106,53 +109,63 @@ async def signup( # Create new user with hashed password hashed_password_value = hash_password(signup_data.password) - new_user = User( + user_dict = { + "email": signup_data.email, + "username": signup_data.username, + "password_hash": hashed_password_value, + "role": "user", + "age": signup_data.age, + "gender": signup_data.gender, + "created_at": datetime.utcnow() + } + + _, new_user_ref = users_ref.add(user_dict) + + logger.info(f"New user registered in 
Firestore: {signup_data.email}") + + return UserResponse( + id=new_user_ref.id, email=signup_data.email, username=signup_data.username, - password_hash=hashed_password_value, role="user", age=signup_data.age, gender=signup_data.gender, - ) - - session.add(new_user) - await session.commit() - await session.refresh(new_user) - - logger.info(f"New user registered: {new_user.email}") - - return UserResponse( - id=new_user.id, - email=new_user.email, - username=new_user.username, - role=new_user.role, - age=new_user.age, - gender=new_user.gender, - created_at=str(new_user.created_at), + created_at=str(user_dict["created_at"]), ) @router.post("/login", response_model=TokenResponse) async def login( - form_data: OAuth2PasswordRequestForm = Depends(), - session: AsyncSession = Depends(get_session), + form_data: OAuth2PasswordRequestForm = Depends() ): """ - Authenticate user and return JWT access token. + Authenticate user and return JWT access token using Firestore. """ + db = get_firebase_db() + if db is None: + raise HTTPException(status_code=500, detail="Firebase not configured") + + users_ref = db.collection("users") + # Find user by username - statement = select(User).where(User.username == form_data.username) - result = await session.exec(statement) - user = result.first() + query = users_ref.where("username", "==", form_data.username).limit(1).stream() + user_doc = next(query, None) # If not found by username, try finding by email - if not user: - statement = select(User).where(User.email == form_data.username) - result = await session.exec(statement) - user = result.first() + if not user_doc: + query = users_ref.where("email", "==", form_data.username).limit(1).stream() + user_doc = next(query, None) # Verify user exists and password is correct - if not user or not verify_password(form_data.password, user.password_hash): + if not user_doc: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Incorrect username or password", + 
headers={"WWW-Authenticate": "Bearer"}, + ) + + user_data = user_doc.to_dict() + if not verify_password(form_data.password, user_data["password_hash"]): raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect username or password", @@ -162,10 +175,10 @@ async def login( # Create access token access_token_expires = timedelta(minutes=settings.access_token_expire_minutes) access_token = create_access_token( - data={"sub": user.username}, expires_delta=access_token_expires + data={"sub": user_data["username"]}, expires_delta=access_token_expires ) - logger.info(f"User logged in: {user.username}") + logger.info(f"User logged in from Firestore: {user_data['username']}") return TokenResponse( access_token=access_token, diff --git a/src/api/main.py b/src/api/main.py index 18ef279d29e9ab0fc2f7cb9f2c3ca36bb3ba72af..b044f728d82bd553b4d0c9d369f4274c1a7d5283 100644 --- a/src/api/main.py +++ b/src/api/main.py @@ -8,16 +8,15 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, HttpUrl -from src.ai_modules.transcription.audio_downloader import YouTubeDownloader -from src.ai_modules.transcription.whisper_transcriber import WhisperTranscriber -from src.ai_modules.summarization.note_generator import NoteGenerator +from src.transcription.audio_downloader import YouTubeDownloader +from src.transcription.whisper_transcriber import WhisperTranscriber +from src.summarization.note_generator import NoteGenerator from src.utils.logger import setup_logger -from src.db.database import create_db_and_tables, async_engine +from src.db.firebase import get_firebase_db from src.db.models import Note, User from src.auth.dependencies import get_current_user from src.api.auth_routes import router as auth_router from src.api.notes_routes import router as notes_router -from sqlmodel.ext.asyncio.session import AsyncSession logger = setup_logger(__name__) @@ -49,8 +48,8 @@ tasks: 
Dict[str, Dict] = {} @asynccontextmanager async def lifespan(app: FastAPI): - logger.info("Lifespan: Initializing database tables...") - await create_db_and_tables() + logger.info("Lifespan: Initializing Firebase...") + get_firebase_db() yield @@ -102,7 +101,7 @@ async def generate( async def process_video_and_save( - task_id: str, youtube_url: str, language: str, user_id: int + task_id: str, youtube_url: str, language: str, user_id: str ): audio_file = None try: @@ -127,16 +126,18 @@ async def process_video_and_save( video_info["duration"], ) - async with AsyncSession(async_engine) as session: - new_note = Note( - user_id=user_id, - video_url=youtube_url, - video_title=video_info["title"], - summary_content=final_notes, - ) - session.add(new_note) - await session.commit() - await session.refresh(new_note) + db = get_firebase_db() + if db: + note_data = { + "user_id": user_id, + "video_url": youtube_url, + "video_title": video_info["title"], + "summary_content": final_notes, + "created_at": datetime.utcnow() + } + db.collection("notes").add(note_data) + else: + logger.warning("Firestore not initialized, note not saved to DB but generated in memory.") tasks[task_id]["notes"] = final_notes tasks[task_id]["keyPoints"] = json_notes.get("key_points", []) if isinstance(json_notes, dict) else [] diff --git a/src/api/notes_routes.py b/src/api/notes_routes.py index 631a9f358e110f6d2ca0cc9fb92fc1e69d301ef6..ba8a1713e5637301d22bca696055571bfbba9297 100644 --- a/src/api/notes_routes.py +++ b/src/api/notes_routes.py @@ -12,7 +12,7 @@ from fastapi.responses import FileResponse, JSONResponse from pydantic import BaseModel, HttpUrl, Field from sqlmodel import Session, select -from src.db.database import get_session +from src.db.firebase import get_firebase_db from src.db.models import User, Note from src.auth.dependencies import get_current_user from src.ai_modules.categorization.categorizer import CategorizationService @@ -45,13 +45,13 @@ class CreateNoteRequest(BaseModel): class 
NoteResponse(BaseModel): - id: int + id: str # Changed to str for Firestore IDs video_url: str video_title: str summary_text: str video_duration: Optional[int] language: str - user_id: int + user_id: str # Changed to str category: Optional[str] created_at: str @@ -125,69 +125,74 @@ async def get_generated_note_content(filename: str): # End of New Endpoints # ========================================== -# ... (Database endpoints kept for compatibility if needed later) ... -# You can leave the rest of the file as is, or I can include it below just in case. -# For brevity, I'll include the standard DB create/get just to not break anything. - @router.get("/{note_id}", response_model=NoteResponse) async def get_note( - note_id: int, - session: AsyncSession = Depends(get_session), + note_id: str, current_user: User = Depends(get_current_user), ): """ - Get a specific note by ID. + Get a specific note by ID from Firestore. """ - statement = select(Note).where(Note.id == note_id, Note.user_id == current_user.id) - result = await session.exec(statement) - note = result.first() + db = get_firebase_db() + if db is None: + raise HTTPException(status_code=500, detail="Firebase not configured") + + note_ref = db.collection("notes").document(note_id) + note_doc = note_ref.get() - if not note: + if not note_doc.exists: raise HTTPException(status_code=404, detail="Note not found") + note_data = note_doc.to_dict() + if note_data.get("user_id") != current_user.id: + raise HTTPException(status_code=403, detail="Forbidden") + return NoteResponse( - id=note.id, - video_url=note.video_url, - video_title=note.video_title, - summary_text=note.summary_content, + id=note_doc.id, + video_url=note_data["video_url"], + video_title=note_data["video_title"], + summary_text=note_data["summary_content"], video_duration=None, language="en", - user_id=note.user_id, - category=note.category, - created_at=str(note.created_at), + user_id=note_data["user_id"], + category=note_data.get("category"), + 
created_at=str(note_data.get("created_at")), ) @router.get("", response_model=List[NoteResponse]) async def list_user_notes( - session: AsyncSession = Depends(get_session), current_user: User = Depends(get_current_user), ): """ - List all notes belonging to the current user. + List all notes belonging to the current user from Firestore. """ - statement = ( - select(Note) - .where(Note.user_id == current_user.id) - .order_by(Note.created_at.desc()) + db = get_firebase_db() + if db is None: + return [] + + notes_ref = db.collection("notes") + query = ( + notes_ref.where("user_id", "==", current_user.id) + .order_by("created_at", direction="DESCENDING") + .stream() ) - result = await session.exec(statement) - notes = result.all() return [ NoteResponse( - id=n.id, - video_url=n.video_url, - video_title=n.video_title, - summary_text=n.summary_content, - video_duration=None, # Update if stored - language="en", # Default - user_id=n.user_id, - category=n.category, - created_at=str(n.created_at), + id=doc.id, + video_url=data["video_url"], + video_title=data["video_title"], + summary_text=data["summary_content"], + video_duration=None, + language="en", + user_id=data["user_id"], + category=data.get("category"), + created_at=str(data.get("created_at")), ) - for n in notes + for doc in query + if (data := doc.to_dict()) ] @@ -195,29 +200,33 @@ async def list_user_notes( async def create_note( note_data: CreateNoteRequest, current_user: User = Depends(get_current_user), - session: AsyncSession = Depends(get_session), ): # Automatically categorize the note category = await categorizer.categorize_text(note_data.summary_text) - new_note = Note( - video_url=str(note_data.video_url), - video_title=note_data.video_title, - summary_content=note_data.summary_text, - user_id=current_user.id, - category=category, - ) - session.add(new_note) - await session.commit() - await session.refresh(new_note) + db = get_firebase_db() + if db is None: + raise HTTPException(status_code=500, 
detail="Firebase not configured") + + note_dict = { + "video_url": str(note_data.video_url), + "video_title": note_data.video_title, + "summary_content": note_data.summary_text, + "user_id": current_user.id, + "category": category, + "created_at": datetime.utcnow() + } + + _, new_note_ref = db.collection("notes").add(note_dict) + return NoteResponse( - id=new_note.id, - video_url=new_note.video_url, - video_title=new_note.video_title, - summary_text=new_note.summary_content, + id=new_note_ref.id, + video_url=note_dict["video_url"], + video_title=note_dict["video_title"], + summary_text=note_dict["summary_content"], video_duration=None, language="en", - user_id=new_note.user_id, - category=new_note.category, - created_at=str(new_note.created_at), + user_id=note_dict["user_id"], + category=note_dict["category"], + created_at=str(note_dict["created_at"]), ) diff --git a/src/auth/__pycache__/dependencies.cpython-312.pyc b/src/auth/__pycache__/dependencies.cpython-312.pyc index b818ec45b44b5e081753c9d92ae684442c035310..0b79521c169544816b3c2c4f10b98d24de25ae0c 100644 Binary files a/src/auth/__pycache__/dependencies.cpython-312.pyc and b/src/auth/__pycache__/dependencies.cpython-312.pyc differ diff --git a/src/auth/dependencies.py b/src/auth/dependencies.py index c594320f4354636cf2ab31832d90cfdb8e465d0f..2812d8545f07081b23525816b9d4fab22e20a374 100644 --- a/src/auth/dependencies.py +++ b/src/auth/dependencies.py @@ -7,9 +7,8 @@ from fastapi import Depends, HTTPException, status from fastapi.security import OAuth2PasswordBearer from sqlmodel import select -from src.db.database import get_session +from src.db.firebase import get_firebase_db from src.db.models import User -from sqlmodel.ext.asyncio.session import AsyncSession from src.auth.security import decode_access_token # OAuth2 scheme for extracting bearer tokens from Authorization header @@ -17,28 +16,10 @@ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login") async def get_current_user( - token: str = 
Depends(oauth2_scheme), session: AsyncSession = Depends(get_session) + token: str = Depends(oauth2_scheme) ) -> User: """ - Get the currently authenticated user from JWT token. - - This dependency extracts the JWT token from the Authorization header, - validates it, and retrieves the corresponding user from the database. - - Args: - token: JWT token from Authorization header - session: Database session - - Returns: - User object if authentication is successful - - Raises: - HTTPException: 401 Unauthorized if token is invalid or user not found - - Usage: - @app.get("/protected") - async def protected_route(current_user: User = Depends(get_current_user)): - return {"message": f"Hello {current_user.username}"} + Get the currently authenticated user from JWT token (using Firestore). """ credentials_exception = HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, @@ -56,15 +37,28 @@ async def get_current_user( if username is None: raise credentials_exception - # Retrieve user from database - statement = select(User).where(User.username == username) - result = await session.exec(statement) - user = result.first() - - if user is None: + # Retrieve user from Firestore + db = get_firebase_db() + if db is None: + # Fallback for when Firebase is not configured + return User( + id="mock_id", + email="mock@example.com", + username=username, + password_hash="mock", + role="user" + ) + + users_ref = db.collection("users") + query = users_ref.where("username", "==", username).limit(1).stream() + user_doc = next(query, None) + + if user_doc is None: raise credentials_exception - return user + user_data = user_doc.to_dict() + user_data["id"] = user_doc.id + return User(**user_data) async def get_current_active_user( diff --git a/src/ai_modules/categorization/README.md b/src/categorization/README.md similarity index 100% rename from src/ai_modules/categorization/README.md rename to src/categorization/README.md diff --git a/src/ai_modules/__init__.py 
b/src/categorization/__init__.py similarity index 100% rename from src/ai_modules/__init__.py rename to src/categorization/__init__.py diff --git a/src/ai_modules/categorization/categorizer.py b/src/categorization/categorizer.py similarity index 100% rename from src/ai_modules/categorization/categorizer.py rename to src/categorization/categorizer.py diff --git a/src/db/__pycache__/models.cpython-312.pyc b/src/db/__pycache__/models.cpython-312.pyc index 48d05af0a4c60b5a430041f76b0963414634d1ed..3e1ffe024306d4f25556b2dc6ca1a3579a81cf5b 100644 Binary files a/src/db/__pycache__/models.cpython-312.pyc and b/src/db/__pycache__/models.cpython-312.pyc differ diff --git a/src/db/firebase.py b/src/db/firebase.py new file mode 100644 index 0000000000000000000000000000000000000000..895c85dbbc12a6285a8d6a680187ec682e5c4dc6 --- /dev/null +++ b/src/db/firebase.py @@ -0,0 +1,53 @@ +""" +Firebase initialization and helper functions. +""" + +import firebase_admin +from firebase_admin import credentials, firestore, auth +from pathlib import Path +from src.utils.config import settings +from src.utils.logger import setup_logger + +logger = setup_logger(__name__) + +_db = None + +def get_firebase_db(): + """ + Initialize Firebase Admin SDK and return Firestore client. + """ + global _db + if _db is not None: + return _db + + try: + service_account_path = Path(settings.firebase_service_account_path) + + if not service_account_path.exists(): + logger.warning(f"Firebase service account file not found at {service_account_path}. Using fallback/mock behavior if applicable.") + # In a real production app, we might want to raise an error here. + # For now, we'll return None and handle it in the services. 
+ return None + + cred = credentials.Certificate(str(service_account_path)) + firebase_admin.initialize_app(cred, { + 'storageBucket': settings.firebase_storage_bucket + }) + + _db = firestore.client() + logger.info("Firebase initialized successfully.") + return _db + except Exception as e: + logger.error(f"Failed to initialize Firebase: {e}") + return None + +def verify_token(id_token: str): + """ + Verify a Firebase ID token. + """ + try: + decoded_token = auth.verify_id_token(id_token) + return decoded_token + except Exception as e: + logger.error(f"Token verification failed: {e}") + return None diff --git a/src/db/models.py b/src/db/models.py index 8695d33ee4f3ba8d68f184a2d2cfdbb824e1ba03..68bdcbe97e8a8474875a947061142bf872964984 100644 --- a/src/db/models.py +++ b/src/db/models.py @@ -5,48 +5,41 @@ Optimized for cloud deployment and mobile app integration. from datetime import datetime from typing import Optional, List -from sqlmodel import SQLModel, Field, Relationship +from pydantic import BaseModel, Field -class User(SQLModel, table=True): - __tablename__ = "users" +class User(BaseModel): + """User model for Firestore 'users' collection.""" + id: Optional[str] = Field(default=None) + email: str + username: str + password_hash: str + role: str = "user" + age: Optional[int] = None + gender: Optional[str] = None + created_at: datetime = Field(default_factory=datetime.utcnow) - id: Optional[int] = Field(default=None, primary_key=True) - email: str = Field(unique=True, index=True, max_length=255, nullable=False) - username: str = Field(max_length=100, nullable=False) - password_hash: str = Field(max_length=255, nullable=False) - role: str = Field(default="user", max_length=50, nullable=False) - age: Optional[int] = Field(default=None) - gender: Optional[str] = Field(default=None, max_length=20) - created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) + class Config: + from_attributes = True - notes: List["Note"] = 
Relationship(back_populates="owner") +class Category(BaseModel): + """Category model for Firestore 'categories' collection.""" + id: Optional[str] = Field(default=None) + name: str + description: Optional[str] = None + user_id: str + created_at: datetime = Field(default_factory=datetime.utcnow) -class Category(SQLModel, table=True): - __tablename__ = "categories" - id: Optional[int] = Field(default=None, primary_key=True) - name: str = Field(index=True, max_length=100, nullable=False) - description: Optional[str] = Field(default=None, max_length=500) - user_id: int = Field(foreign_key="users.id", index=True, nullable=False) - created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) - - -class Note(SQLModel, table=True): +class Note(BaseModel): """ - Note model synchronized with Supabase schema. - Removed content_json and keywords to prevent UndefinedColumnError. + Note model for Firestore 'notes' collection. """ - - __tablename__ = "notes" - - id: Optional[int] = Field(default=None, primary_key=True) - user_id: int = Field(foreign_key="users.id", index=True, nullable=False) - video_url: str = Field(index=True, max_length=500, nullable=False) - video_title: str = Field(max_length=500, nullable=False) - summary_content: str = Field(nullable=False) - category: Optional[str] = Field(default="Uncategorized", max_length=100) - created_at: datetime = Field(default_factory=datetime.utcnow, nullable=False) - - owner: Optional[User] = Relationship(back_populates="notes") + id: Optional[str] = Field(default=None) + user_id: str + video_url: str + video_title: str + summary_content: str + category: Optional[str] = "Uncategorized" + created_at: datetime = Field(default_factory=datetime.utcnow) diff --git a/src/ai_modules/recommendation/README.md b/src/recommendation/README.md similarity index 100% rename from src/ai_modules/recommendation/README.md rename to src/recommendation/README.md diff --git a/src/ai_modules/categorization/__init__.py 
b/src/recommendation/__init__.py similarity index 100% rename from src/ai_modules/categorization/__init__.py rename to src/recommendation/__init__.py diff --git a/src/ai_modules/recommendation/recommender.py b/src/recommendation/recommender.py similarity index 100% rename from src/ai_modules/recommendation/recommender.py rename to src/recommendation/recommender.py diff --git a/src/ai_modules/summarization/README.md b/src/summarization/README.md similarity index 100% rename from src/ai_modules/summarization/README.md rename to src/summarization/README.md diff --git a/src/ai_modules/recommendation/__init__.py b/src/summarization/__init__.py similarity index 100% rename from src/ai_modules/recommendation/__init__.py rename to src/summarization/__init__.py diff --git a/src/summarization/__pycache__/__init__.cpython-312.pyc b/src/summarization/__pycache__/__init__.cpython-312.pyc index 68a8e4dec4971561d1fec7d07848e2d5a4327159..6c256c13f30afcabb49128663d3484af91b894fb 100644 Binary files a/src/summarization/__pycache__/__init__.cpython-312.pyc and b/src/summarization/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/summarization/__pycache__/__init__.cpython-314.pyc b/src/summarization/__pycache__/__init__.cpython-314.pyc index 7466701d9da682e1811ae5b538956b52400f92b1..c2226d7ed96bdabbd1b47f659734cd3ea8345d93 100644 Binary files a/src/summarization/__pycache__/__init__.cpython-314.pyc and b/src/summarization/__pycache__/__init__.cpython-314.pyc differ diff --git a/src/summarization/__pycache__/note_generator.cpython-312.pyc b/src/summarization/__pycache__/note_generator.cpython-312.pyc index 9a4244095805b96f3839fdc10f32ad113d9e718a..46f3be0175cf4b3788cdcf39f1b53231acbd2406 100644 Binary files a/src/summarization/__pycache__/note_generator.cpython-312.pyc and b/src/summarization/__pycache__/note_generator.cpython-312.pyc differ diff --git a/src/summarization/__pycache__/note_generator.cpython-314.pyc 
b/src/summarization/__pycache__/note_generator.cpython-314.pyc index 6179e9859b1af913138c752c7e14c0f76fba1d8b..a8806679becd2c567295a309b49e645779a6df9d 100644 Binary files a/src/summarization/__pycache__/note_generator.cpython-314.pyc and b/src/summarization/__pycache__/note_generator.cpython-314.pyc differ diff --git a/src/summarization/__pycache__/schemas.cpython-312.pyc b/src/summarization/__pycache__/schemas.cpython-312.pyc index e57bc3c00f6782429fbb5ae66ca3c2f1995e98e2..b97d1c5402d1e0d655e0595249cd1943073627b4 100644 Binary files a/src/summarization/__pycache__/schemas.cpython-312.pyc and b/src/summarization/__pycache__/schemas.cpython-312.pyc differ diff --git a/src/ai_modules/summarization/note_generator.py b/src/summarization/note_generator.py similarity index 98% rename from src/ai_modules/summarization/note_generator.py rename to src/summarization/note_generator.py index a1db03c501a2d9af855d200850def93a6f0fc0f4..607d463a5a722d0305dac22acfd20fc8c8303f80 100644 --- a/src/ai_modules/summarization/note_generator.py +++ b/src/summarization/note_generator.py @@ -5,7 +5,7 @@ from pydantic import ValidationError from src.utils.logger import setup_logger from src.utils.config import settings -from src.ai_modules.summarization.schemas import StudyNoteSchema +from src.summarization.schemas import StudyNoteSchema logger = setup_logger(__name__) diff --git a/src/ai_modules/summarization/schemas.py b/src/summarization/schemas.py similarity index 100% rename from src/ai_modules/summarization/schemas.py rename to src/summarization/schemas.py diff --git a/src/ai_modules/summarization/segmenter.py b/src/summarization/segmenter.py similarity index 100% rename from src/ai_modules/summarization/segmenter.py rename to src/summarization/segmenter.py diff --git a/src/ai_modules/transcription/README.md b/src/transcription/README.md similarity index 100% rename from src/ai_modules/transcription/README.md rename to src/transcription/README.md diff --git 
a/src/ai_modules/summarization/__init__.py b/src/transcription/__init__.py similarity index 100% rename from src/ai_modules/summarization/__init__.py rename to src/transcription/__init__.py diff --git a/src/transcription/__pycache__/__init__.cpython-312.pyc b/src/transcription/__pycache__/__init__.cpython-312.pyc index 67e5f1aedc3029e4c44a2f2058aa7de93e7221cb..5b5816e7e925f1549065106c14e98a2bd4bb2de8 100644 Binary files a/src/transcription/__pycache__/__init__.cpython-312.pyc and b/src/transcription/__pycache__/__init__.cpython-312.pyc differ diff --git a/src/transcription/__pycache__/__init__.cpython-314.pyc b/src/transcription/__pycache__/__init__.cpython-314.pyc index 06b85bdd84e7d40763cfa06f70c7cad5aa551831..0d3e8d5e19be8faddfde3d03269e5a71e0d328aa 100644 Binary files a/src/transcription/__pycache__/__init__.cpython-314.pyc and b/src/transcription/__pycache__/__init__.cpython-314.pyc differ diff --git a/src/transcription/__pycache__/whisper_transcriber.cpython-312.pyc b/src/transcription/__pycache__/whisper_transcriber.cpython-312.pyc index 67af0015d5cb349ce422f9a4b922a411f4e7b918..938062b2a5f6ed4e81628397c376fdffd20504d4 100644 Binary files a/src/transcription/__pycache__/whisper_transcriber.cpython-312.pyc and b/src/transcription/__pycache__/whisper_transcriber.cpython-312.pyc differ diff --git a/src/transcription/__pycache__/whisper_transcriber.cpython-314.pyc b/src/transcription/__pycache__/whisper_transcriber.cpython-314.pyc index b5f89ad0c6bd1a80287ddd9bb1dfaee55739054e..b9fdf14f1dde47ada3d0a2275fbc909f6d27ca8e 100644 Binary files a/src/transcription/__pycache__/whisper_transcriber.cpython-314.pyc and b/src/transcription/__pycache__/whisper_transcriber.cpython-314.pyc differ diff --git a/src/ai_modules/transcription/audio_downloader.py b/src/transcription/audio_downloader.py similarity index 100% rename from src/ai_modules/transcription/audio_downloader.py rename to src/transcription/audio_downloader.py diff --git 
a/src/ai_modules/transcription/audio_processor.py b/src/transcription/audio_processor.py similarity index 100% rename from src/ai_modules/transcription/audio_processor.py rename to src/transcription/audio_processor.py diff --git a/src/ai_modules/transcription/whisper_transcriber.py b/src/transcription/whisper_transcriber.py similarity index 100% rename from src/ai_modules/transcription/whisper_transcriber.py rename to src/transcription/whisper_transcriber.py diff --git a/src/utils/__pycache__/config.cpython-312.pyc b/src/utils/__pycache__/config.cpython-312.pyc index cd15904be2b663ee2b64046faec3b2e3ef57bd44..9f3536241d06586d20df79799ea3cf5f836e1cea 100644 Binary files a/src/utils/__pycache__/config.cpython-312.pyc and b/src/utils/__pycache__/config.cpython-312.pyc differ diff --git a/src/utils/__pycache__/config.cpython-314.pyc b/src/utils/__pycache__/config.cpython-314.pyc index 799c58fd790bd0d4f7382f9af984f610df47af7d..b8cc1dcddb1cafbdf68fd19f6819c9d276fef633 100644 Binary files a/src/utils/__pycache__/config.cpython-314.pyc and b/src/utils/__pycache__/config.cpython-314.pyc differ diff --git a/src/utils/config.py b/src/utils/config.py index 9c559922f7d2fb2e23a0e40e2a15a356c388a080..891b468693d6e22230b57000acaad1b073c18219 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -62,10 +62,24 @@ class Settings(BaseSettings): description="FastAPI port number" ) - # Database Configuration + # Database Configuration (PostgreSQL - Deprecated in favor of Firebase) database_url: str = Field( default="postgresql+asyncpg://postgres:password@localhost:5432/studynotes", - description="PostgreSQL database connection URL (use asyncpg driver)" + description="PostgreSQL database connection URL (Deprecated)" + ) + + # Firebase Configuration + firebase_project_id: str = Field( + default="aidea-project-id", + description="Firebase Project ID" + ) + firebase_service_account_path: str = Field( + default="serviceAccountKey.json", + description="Path to Firebase service account 
JSON file" + ) + firebase_storage_bucket: str = Field( + default="", + description="Firebase Storage bucket name" ) # Authentication Configuration @@ -91,7 +105,8 @@ class Settings(BaseSettings): model_config = SettingsConfigDict( env_file=".env", env_file_encoding="utf-8", - case_sensitive=False + case_sensitive=False, + extra="ignore" ) def __init__(self, **kwargs): diff --git a/temp/.gitkeep b/temp/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..f6456903e2c344a08be229af36726fd76d598aa4 --- /dev/null +++ b/temp/.gitkeep @@ -0,0 +1,2 @@ +# Temporary files directory +# Audio files are temporarily stored here during processing diff --git a/tests/test_gemini.py b/tests/test_gemini.py new file mode 100644 index 0000000000000000000000000000000000000000..a174f4db952a422e313786e728bafd50f0b20bb6 --- /dev/null +++ b/tests/test_gemini.py @@ -0,0 +1,29 @@ +import os +import asyncio +from google import genai +from dotenv import load_dotenv + + +async def test_gemini(): + load_dotenv() + api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + print("❌ Error: GOOGLE_API_KEY not found in .env") + return + + client = genai.Client(api_key=api_key) + model_id = "gemini-flash-latest" + + print(f"Testing Gemini with model: {model_id}...") + try: + response = client.models.generate_content( + model=model_id, + contents="Hello! Can you confirm you are working? Reply with 'Yes, I am working!'", + ) + print(f"βœ… Success! Response: {response.text}") + except Exception as e: + print(f"❌ Gemini Error: {e}") + + +if __name__ == "__main__": + asyncio.run(test_gemini()) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..2e94448eaecee924de823c4b4a78def8b966c4c2 --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,143 @@ +""" +End-to-end pipeline test for YouTube study notes generation. 
+""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from src.transcription.audio_downloader import YouTubeDownloader +from src.transcription.audio_processor import AudioProcessor +from src.transcription.whisper_transcriber import WhisperTranscriber +from src.summarization.segmenter import TranscriptSegmenter +from src.summarization.note_generator import NoteGenerator +from src.utils.logger import setup_logger +from src.utils.config import settings + +logger = setup_logger(__name__) + + +def test_pipeline(youtube_url: str): + """ + Test the complete pipeline from YouTube URL to study notes. + + Args: + youtube_url: YouTube video URL to process + """ + logger.info("=" * 60) + logger.info("STARTING PIPELINE TEST") + logger.info("=" * 60) + + audio_file = None + + try: + # Step 1: Download audio + logger.info("\n[1/5] Downloading audio from YouTube...") + downloader = YouTubeDownloader() + + if not downloader.is_valid_youtube_url(youtube_url): + raise ValueError("Invalid YouTube URL") + + video_info = downloader.get_video_info(youtube_url) + logger.info(f"Video: {video_info['title']}") + logger.info(f"Duration: {video_info['duration']}s") + + audio_file = downloader.download_audio(youtube_url, "test_video") + logger.info(f"βœ… Audio downloaded: {audio_file}") + + # Step 2: Validate audio + logger.info("\n[2/5] Validating audio file...") + processor = AudioProcessor() + + if not processor.validate_audio_file(audio_file): + raise ValueError("Audio validation failed") + + audio_info = processor.get_audio_info(audio_file) + logger.info(f"βœ… Audio validated: {audio_info['duration']:.2f}s, {audio_info['framerate']}Hz") + + # Step 3: Transcribe + logger.info("\n[3/5] Transcribing audio with Whisper...") + transcriber = WhisperTranscriber() + transcript_data = transcriber.transcribe(audio_file, language="en", verbose=True) + + logger.info(f"βœ… Transcription complete:") + logger.info(f" - Text length: 
{len(transcript_data['text'])} characters") + logger.info(f" - Segments: {len(transcript_data['segments'])}") + logger.info(f" - Language: {transcript_data['language']}") + + # Step 4: Segment transcript + logger.info("\n[4/5] Segmenting transcript...") + segmenter = TranscriptSegmenter() + + # Clean text + cleaned_text = segmenter.clean_text(transcript_data['text']) + logger.info(f"Text cleaned: {len(transcript_data['text'])} β†’ {len(cleaned_text)} chars") + + # Segment + segments = segmenter.segment_transcript(transcript_data, method="time") + logger.info(f"βœ… Created {len(segments)} segments") + + # Step 5: Generate notes + logger.info("\n[5/5] Generating study notes with Gemini...") + note_gen = NoteGenerator() + + if len(transcript_data['text'].split()) < 2000: + notes = note_gen.generate_notes_from_full_transcript( + transcript_data['text'], + video_info['title'] + ) + else: + notes = note_gen.generate_notes_from_segments(segments) + + logger.info(f"βœ… Notes generated: {len(notes)} characters") + + # Format final notes + final_notes = note_gen.format_final_notes( + notes, + video_info['title'], + youtube_url, + video_info['duration'] + ) + + # Save notes + output_file = settings.output_dir / "test_output_notes.md" + output_file.write_text(final_notes, encoding='utf-8') + + logger.info(f"\nβœ… SUCCESS! 
Notes saved to: {output_file}") + + # Show preview + logger.info("\n" + "=" * 60) + logger.info("NOTES PREVIEW (first 500 chars)") + logger.info("=" * 60) + logger.info(final_notes[:500] + "...") + + return True + + except Exception as e: + logger.error(f"\n❌ Pipeline test failed: {e}") + import traceback + traceback.print_exc() + return False + + finally: + # Cleanup + if audio_file and audio_file.exists(): + logger.info("\nCleaning up temporary files...") + downloader.cleanup(audio_file) + + +if __name__ == "__main__": + # Test with a short educational video + # Replace with an actual YouTube URL + TEST_URL = "https://www.youtube.com/watch?v=aircAruvnKk" # Example: 3Blue1Brown + + if len(sys.argv) > 1: + TEST_URL = sys.argv[1] + + logger.info(f"Test URL: {TEST_URL}") + + success = test_pipeline(TEST_URL) + + sys.exit(0 if success else 1)