Spaces:
Sleeping
Sleeping
| from pptx import Presentation | |
| from pptx.util import Inches, Pt | |
| from pptx.enum.text import PP_ALIGN | |
| from pptx.dml.color import RGBColor | |
# Module-level presentation shared by the slide helpers below.
# Slide size is 10 in x 7.5 in.
prs = Presentation()
prs.slide_width, prs.slide_height = Inches(10), Inches(7.5)
def add_title_slide(title, subtitle):
    """Append a title slide: white centred text on a solid blue background.

    The slide is added to the module-level ``prs`` presentation. ``title`` is
    rendered bold at 60 pt and ``subtitle`` at 32 pt in a second text box
    placed below it.
    """
    white = RGBColor(255, 255, 255)

    slide = prs.slides.add_slide(prs.slide_layouts[6])  # blank layout
    bg_fill = slide.background.fill
    bg_fill.solid()
    bg_fill.fore_color.rgb = RGBColor(51, 152, 219)  # blue

    # Headline box
    heading = slide.shapes.add_textbox(
        Inches(0.5), Inches(2.5), Inches(9), Inches(1.5)
    ).text_frame
    heading.text = title
    heading_par = heading.paragraphs[0]
    heading_par.font.size = Pt(60)
    heading_par.font.bold = True
    heading_par.font.color.rgb = white
    heading_par.alignment = PP_ALIGN.CENTER

    # Tagline box, directly under the headline
    tagline = slide.shapes.add_textbox(
        Inches(0.5), Inches(4.2), Inches(9), Inches(1)
    ).text_frame
    tagline.text = subtitle
    tagline_par = tagline.paragraphs[0]
    tagline_par.font.size = Pt(32)
    tagline_par.font.color.rgb = white
    tagline_par.alignment = PP_ALIGN.CENTER
def add_content_slide(title, content_list):
    """Append a content slide: a bold heading plus one paragraph per item.

    The slide is added to the module-level ``prs`` presentation on a light
    grey background. The heading sits in its own text box at the top; the
    items go into a second, word-wrapped text box below it.

    Args:
        title: Heading text for the slide.
        content_list: Sequence of strings. Each entry becomes its own
            paragraph, in order (22 pt, 12 pt spacing before and after,
            indent level 0). An empty sequence leaves the content box with
            its initial empty paragraph.
    """
    slide = prs.slides.add_slide(prs.slide_layouts[6])  # Blank layout
    fill = slide.background.fill
    fill.solid()
    fill.fore_color.rgb = RGBColor(240, 240, 240)

    # Add title
    title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(0.8))
    title_frame = title_box.text_frame
    title_frame.text = title
    title_par = title_frame.paragraphs[0]
    title_par.font.size = Pt(44)
    title_par.font.bold = True
    title_par.font.color.rgb = RGBColor(44, 62, 80)

    # Add content
    content_box = slide.shapes.add_textbox(Inches(1), Inches(1.5), Inches(8), Inches(5.5))
    text_frame = content_box.text_frame
    text_frame.word_wrap = True

    for i, item in enumerate(content_list):
        # The first item reuses the paragraph the text frame starts with;
        # later items take the paragraph returned by add_paragraph() instead
        # of re-indexing text_frame.paragraphs on every iteration.
        p = text_frame.paragraphs[0] if i == 0 else text_frame.add_paragraph()
        p.text = item
        p.font.size = Pt(22)
        p.font.color.rgb = RGBColor(52, 73, 94)
        p.space_before = Pt(12)
        p.space_after = Pt(12)
        p.level = 0
# NOTE(review): several literals below contain sequences such as "β’" and
# "π¦" that look like mis-decoded UTF-8 (bullets / emoji). They are kept
# exactly as found; confirm the file's encoding before changing them.

# Slide 1: opening title slide
add_title_slide("π¦ Bird Species Classifier", "Complete Project Logic & Architecture")

# Slides 2-16: (heading, paragraphs) pairs, listed in presentation order.
_CONTENT_SLIDES = [
    # Slide 2: Project Overview
    ("Project Overview", [
        "β AI-powered application for identifying bird species",
        "β Accepts TWO inputs: Bird images and bird sounds",
        "β Uses Deep Learning models to classify species",
        "β Available in TWO formats: Desktop (Tkinter) & Web (Flask)",
        "β Real-time predictions with high accuracy",
    ]),
    # Slide 3: Key Features
    ("Key Features", [
        "π· Image Classification - Identify birds from photos",
        "π΅ Audio Classification - Identify birds from sound recordings",
        "π Real-time Processing - Instant predictions",
        "π¨ User-friendly Interface - Easy to use for everyone",
        "β‘ Multi-threaded Loading - Smooth user experience",
        "βοΈ Uses Pre-trained Models - No training needed",
    ]),
    # Slide 4: Architecture Overview
    ("System Architecture", [
        "INPUT LAYER",
        " β’ Image File (.jpg, .png, .bmp)",
        " β’ Audio File (.wav, .flac, .mp3)",
        "",
        "PROCESSING LAYER",
        " β’ Image Processor / Audio Feature Extractor",
        "",
        "AI MODEL LAYER",
        " β’ Vision Transformer (Image)",
        " β’ wav2vec2 Model (Audio)",
        "",
        "OUTPUT LAYER",
        " β’ Bird Species Prediction",
    ]),
    # Slide 5: Image Classification Logic
    ("Image Classification Pipeline", [
        "1. USER UPLOADS IMAGE",
        " β Select bird photo file",
        "",
        "2. IMAGE PREPROCESSING",
        " β Load image using PIL",
        " β Convert to RGB format",
        " β Resize & normalize using AutoImageProcessor",
        "",
        "3. MODEL INFERENCE",
        " β Pass processed image to ViT/CNN model",
        " β Extract feature embeddings",
        "",
        "4. PREDICTION",
        " β Get logits (raw scores) from model",
        " β Apply argmax to get highest probability class",
        " β Return bird species name",
    ]),
    # Slide 6: Audio Classification Logic
    ("Audio Classification Pipeline", [
        "1. USER UPLOADS AUDIO FILE",
        " β Select .wav, .flac, or .mp3 file",
        "",
        "2. AUDIO PREPROCESSING (librosa)",
        " β Load audio file",
        " β Resample to 16kHz (standard rate)",
        " β Convert to MONO (single channel)",
        " β Extract sound features",
        "",
        "3. FEATURE EXTRACTION (wav2vec2)",
        " β Process audio through feature extractor",
        " β Convert to numerical embeddings",
        "",
        "4. MODEL INFERENCE",
        " β Pass embeddings to audio classification model",
        " β Get prediction logits",
        "",
        "5. OUTPUT SPECIES",
        " β argmax() gets highest probability",
    ]),
    # Slide 7: Models Deep Dive
    ("AI Models Used", [
        "IMAGE MODEL: chriamue/bird-species-classifier",
        " β’ Type: Vision Transformer (ViT)",
        " β’ Uses: Self-attention mechanism for images",
        " β’ Divides image into patches β Analyzes relationships",
        "",
        "AUDIO MODEL: greenarcade/wav2vec2-vd-bird-sound",
        " β’ Type: wav2vec2 (Self-supervised learning)",
        " β’ Uses: Transformer architecture on audio",
        " β’ Pre-trained on large unlabeled audio β Fine-tuned for birds",
    ]),
    # Slide 8: Technologies & Libraries
    ("Tech Stack", [
        "π BACKEND",
        " β’ Python 3.13",
        " β’ PyTorch - Deep learning framework",
        " β’ Transformers - Pre-trained models library",
        " β’ librosa - Audio signal processing",
        "",
        "π₯οΈ FRONTEND",
        " β’ Tkinter - Desktop GUI (main.py)",
        " β’ Flask - Web framework (app.py)",
        " β’ HTML/CSS/JavaScript - Web interface",
    ]),
    # Slide 9: Desktop App (Tkinter)
    ("Desktop Version (main.py)", [
        "UI Components:",
        " β’ Upload Image Button β Opens file dialog",
        " β’ Upload Audio Button β Opens file dialog",
        " β’ Preview Area β Shows selected file",
        " β’ Result Area β Shows prediction + species name",
        " β’ Next Bird Button β Reset for new prediction",
        "",
        "Workflow:",
        " 1. App loads models on startup (background thread)",
        " 2. User selects image/audio",
        " 3. Processing happens in background",
        " 4. Result displays in GUI",
        " 5. User can predict again",
    ]),
    # Slide 10: Web App (Flask)
    ("Web Version (app.py)", [
        "Backend Endpoints:",
        " β’ / β Serves HTML interface",
        " β’ /status β Check if models are loaded",
        " β’ /classify-image β Process image",
        " β’ /classify-audio β Process audio",
        "",
        "Frontend (JavaScript):",
        " β’ Polls /status until models load",
        " β’ Sends files to Flask via POST requests",
        " β’ Displays results in browser",
        "",
        "Run: python app.py",
        "Access: http://localhost:5000",
    ]),
    # Slide 11: Data Flow Diagram
    ("Complete Data Flow", [
        "USER INPUT (Image/Audio)",
        " β",
        "FILE UPLOAD (Form/Dialog)",
        " β",
        "PREPROCESSING (PIL/librosa)",
        " β",
        "FEATURE EXTRACTION (AutoProcessor/AutoExtractor)",
        " β",
        "DEEP LEARNING MODEL (ViT/wav2vec2)",
        " β",
        "OUTPUT PREDICTIONS (Logits)",
        " β",
        "ARGMAX (Get highest probability)",
        " β",
        "SPECIES NAME (Display to user)",
    ]),
    # Slide 12: Key Concepts Explained
    ("Key AI Concepts", [
        "TRANSFORMER: Neural network architecture using attention",
        " β Analyzes relationships between all input elements",
        "",
        "VISION TRANSFORMER (ViT): Applies transformers to images",
        " β Treats image patches like words in sentences",
        "",
        "WAV2VEC2: Self-supervised audio representation learning",
        " β Learns from unlabeled audio data",
        " β Converts sound to meaningful features",
        "",
        "INFERENCE: Using trained model to make predictions",
        " β No training happening, just prediction",
        "",
        "ARGMAX: Gets index of highest value in array",
        " β Converts [0.1, 0.8, 0.1] β index 1 (bird species)",
    ]),
    # Slide 13: Performance Metrics
    ("Model Performance", [
        "IMAGE CLASSIFICATION:",
        " β’ Models: Trained on thousands of bird images",
        " β’ Accuracy: High confidence for clear photos",
        " β’ Speed: ~1-2 seconds per image",
        "",
        "AUDIO CLASSIFICATION:",
        " β’ Models: Fine-tuned on bird sound datasets",
        " β’ Accuracy: Excellent for 5-30 second clips",
        " β’ Speed: ~1-3 seconds per audio file",
        "",
        "NOTE: Quality of input affects accuracy",
        " β Clear images = Better results",
        " β Clean audio = Better results",
    ]),
    # Slide 14: Error Handling
    ("Error Handling & Edge Cases", [
        "Image Issues:",
        " β’ Invalid format β Error message shown",
        " β’ Corrupted file β Exception caught and reported",
        "",
        "Audio Issues:",
        " β’ Unsupported format β librosa fails gracefully",
        " β’ Empty audio β Validation check prevents crash",
        " β’ Very short clips β Model may give low confidence",
        "",
        "Model Loading:",
        " β’ Networks offline β Downloads cached models",
        " β’ Large files β Streamed from Hugging Face",
        " β’ Loading status β Status bar shows progress",
    ]),
    # Slide 15: Future Improvements
    ("Potential Enhancements", [
        "π§ TECHNICAL",
        " β’ Add confidence scores to predictions",
        " β’ Support for batch processing (multiple files)",
        " β’ Model quantization for faster inference",
        " β’ GPU acceleration for speedup",
        "",
        "π FEATURES",
        " β’ Database to store prediction history",
        " β’ User authentication for web version",
        " β’ Mobile app (React Native / Flutter)",
        " β’ Real-time bird detection from camera/mic",
        " β’ Multi-species probability rankings",
    ]),
    # Slide 16: Summary
    ("Summary", [
        "β TWO INPUT TYPES: Images & Audio recordings",
        "β TWO INTERFACES: Desktop (Tkinter) & Web (Flask)",
        "β STATE-OF-ART MODELS: Vision Transformer + wav2vec2",
        "β AUTOMATIC PREPROCESSING: Handles all formats",
        "β REAL-TIME PREDICTIONS: Instant results",
        "β USER-FRIENDLY: No ML knowledge needed",
        "β SCALABLE: Can serve multiple users (web version)",
    ]),
]

# Emit the content slides in the order listed above.
for _heading, _paragraphs in _CONTENT_SLIDES:
    add_content_slide(_heading, _paragraphs)

# Slide 17: closing title slide
add_title_slide("Questions?", "π¦ Bird Species Classifier π¦")

# Write the deck to the current working directory and report the file name.
prs.save('Bird_Species_Classifier_PPT.pptx')
print("β PowerPoint created: Bird_Species_Classifier_PPT.pptx")