Spaces:

prathameshv07
/

Multilingual-Audio-Intelligence-System

Running

App Files Community

Prathamesh Sarjerao Vaidya committed on Sep 5, 2025

Commit

65fbbac

1 Parent(s): 65f46e8

made changes

Browse files

Files changed (5) hide show

.github/workflows/check.yml +58 -3
.github/workflows/main.yml +48 -3
Dockerfile +6 -0
model_preloader.py +57 -27
startup.py +30 -8

.github/workflows/check.yml CHANGED Viewed

@@ -20,28 +20,77 @@ jobs:
     runs-on: ubuntu-latest
     needs: check-file-size
     if: github.event_name == 'pull_request'
     steps:
       - uses: actions/checkout@v3
         with:
           lfs: true
       - name: Pull LFS files
         run: |
           git lfs install
           git lfs pull
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.11'
       - name: Setup system dependencies
-        run: chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
       - name: Convert MD to PDF
-        run: chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
       - name: Upload PDF artifacts
         uses: actions/upload-artifact@v4
         with:
           name: converted-pdfs
@@ -51,4 +100,10 @@ jobs:
       - name: Upload to Google Drive
         env:
           GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
-        run: python .github/workflows/scripts/upload_to_drive.py

     runs-on: ubuntu-latest
     needs: check-file-size
     if: github.event_name == 'pull_request'
+    outputs:
+      skip_pdf: ${{ steps.check_md_changes.outputs.skip_pdf }}
     steps:
       - uses: actions/checkout@v3
         with:
           lfs: true
+          fetch-depth: 0  # Need full history for git diff
       - name: Pull LFS files
         run: |
           git lfs install
           git lfs pull
+      # Decide whether the Markdown -> PDF steps need to run and expose the
+      # result as the `skip_pdf` step output ('true' or 'false').
+      - name: Check for MD file changes
+        id: check_md_changes
+        env:
+          # Event data is handed to the script through environment variables
+          # instead of being expanded into the script text, so a commit
+          # message containing quotes or $(...) cannot run as shell code.
+          EVENT_NAME: ${{ github.event_name }}
+          BASE_REF: ${{ github.base_ref }}
+          HEAD_COMMIT_MSG: ${{ github.event.head_commit.message }}
+        run: |
+          echo "Checking for markdown file changes..."
+          # Use the event's head commit message when present; otherwise fall
+          # back to the message of the checked-out commit.
+          COMMIT_MSG="$HEAD_COMMIT_MSG"
+          if [ -z "$COMMIT_MSG" ]; then
+            COMMIT_MSG=$(git log -1 --pretty=%B)
+          fi
+          echo "Commit message: $COMMIT_MSG"
+          # Check if commit message indicates MD changes
+          MD_IN_COMMIT=$(printf '%s\n' "$COMMIT_MSG" | grep -i "\.md\|markdown\|documentation\|docs\|readme" || true)
+          # Check if any MD files were actually modified in the diff
+          if [ "$EVENT_NAME" = "pull_request" ]; then
+            # Compare against the PR's actual target branch (falls back to
+            # main if the base ref is empty) instead of assuming main.
+            MD_FILES_CHANGED=$(git diff --name-only "origin/${BASE_REF:-main}...HEAD" | grep "\.md$" || true)
+          else
+            MD_FILES_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep "\.md$" || true)
+          fi
+          echo "MD files in commit message: $MD_IN_COMMIT"
+          echo "MD files changed: $MD_FILES_CHANGED"
+          # Skip PDF conversion if no MD files changed AND no MD-related keywords in commit
+          if [ -z "$MD_FILES_CHANGED" ] && [ -z "$MD_IN_COMMIT" ]; then
+            echo "skip_pdf=true" >> "$GITHUB_OUTPUT"
+            echo "Skipping PDF conversion - no MD files modified"
+          else
+            echo "skip_pdf=false" >> "$GITHUB_OUTPUT"
+            echo "MD files detected - will convert to PDF"
+            echo "Changed MD files: $MD_FILES_CHANGED"
+          fi
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.11'
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Setup system dependencies
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
+        run: |
+          echo "Setting up system dependencies for PDF conversion..."
+          chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Convert MD to PDF
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
+        run: |
+          echo "Converting MD files to PDF..."
+          chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Upload PDF artifacts
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
         uses: actions/upload-artifact@v4
         with:
           name: converted-pdfs
       - name: Upload to Google Drive
         env:
           GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
+        run: |
+          if [ "${{ steps.check_md_changes.outputs.skip_pdf }}" = "true" ]; then
+            echo "Skipped PDF conversion - uploading existing files only"
+          else
+            echo "Uploading files including new PDFs to Google Drive"
+          fi
+          python .github/workflows/scripts/upload_to_drive.py

.github/workflows/main.yml CHANGED Viewed

@@ -18,18 +18,57 @@ jobs:
           git lfs install
           git lfs pull
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.11'
       - name: Setup system dependencies
-        run: chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
       - name: Convert MD to PDF
-        run: chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
       - name: Upload PDF artifacts
         uses: actions/upload-artifact@v4
         with:
           name: converted-pdfs
@@ -42,7 +81,13 @@ jobs:
           GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
           # Fallback authentication method (Service Account)
           GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
-        run: python .github/workflows/scripts/upload_to_drive.py
       - name: Push to Hugging Face hub
         env:

           git lfs install
           git lfs pull
+      # Decide whether the Markdown -> PDF steps need to run and expose the
+      # result as the `skip_pdf` step output ('true' or 'false').
+      - name: Check for MD file changes
+        id: check_md_changes
+        run: |
+          echo "Checking for markdown file changes..."
+          # Get the commit message
+          COMMIT_MSG=$(git log -1 --pretty=%B)
+          echo "Commit message: $COMMIT_MSG"
+          # Check if commit message indicates MD changes
+          MD_IN_COMMIT=$(printf '%s\n' "$COMMIT_MSG" | grep -i "\.md\|markdown\|documentation\|docs\|readme" || true)
+          # Check if any MD files were actually modified in the last commit.
+          # HEAD~1 does not exist on a depth-1 checkout or a root commit; in
+          # that case git diff would fail silently and the step would wrongly
+          # report "nothing changed", so detect it and do not skip.
+          if git rev-parse --verify --quiet "HEAD~1^{commit}" >/dev/null; then
+            HAVE_PARENT=true
+            MD_FILES_CHANGED=$(git diff --name-only HEAD~1 HEAD | grep "\.md$" || true)
+          else
+            HAVE_PARENT=false
+            MD_FILES_CHANGED=""
+            echo "Parent commit not available - cannot diff, PDF conversion will not be skipped"
+          fi
+          echo "MD files in commit message: $MD_IN_COMMIT"
+          echo "MD files changed: $MD_FILES_CHANGED"
+          # Skip PDF conversion if no MD files changed AND no MD-related keywords in commit
+          if [ "$HAVE_PARENT" = "true" ] && [ -z "$MD_FILES_CHANGED" ] && [ -z "$MD_IN_COMMIT" ]; then
+            echo "skip_pdf=true" >> "$GITHUB_OUTPUT"
+            echo "Skipping PDF conversion - no MD files modified"
+          else
+            echo "skip_pdf=false" >> "$GITHUB_OUTPUT"
+            echo "MD files detected - will convert to PDF"
+            echo "Changed MD files: $MD_FILES_CHANGED"
+          fi
       - name: Setup Python
         uses: actions/setup-python@v4
         with:
           python-version: '3.11'
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Setup system dependencies
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
+        run: |
+          echo "Setting up system dependencies for PDF conversion..."
+          chmod +x .github/workflows/scripts/setup_system.sh && .github/workflows/scripts/setup_system.sh
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Convert MD to PDF
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
+        run: |
+          echo "Converting MD files to PDF..."
+          chmod +x .github/workflows/scripts/convert_md_to_pdf.sh && .github/workflows/scripts/convert_md_to_pdf.sh
+      # CONDITIONAL STEP: Only run if MD files changed
       - name: Upload PDF artifacts
+        if: steps.check_md_changes.outputs.skip_pdf == 'false'
         uses: actions/upload-artifact@v4
         with:
           name: converted-pdfs
           GOOGLE_OAUTH_TOKEN: ${{ secrets.GOOGLE_OAUTH_TOKEN }}
           # Fallback authentication method (Service Account)
           GOOGLE_SERVICE_ACCOUNT_KEY: ${{ secrets.GOOGLE_SERVICE_ACCOUNT_KEY }}
+        run: |
+          if [ "${{ steps.check_md_changes.outputs.skip_pdf }}" = "true" ]; then
+            echo "Skipped PDF conversion - uploading existing files only"
+          else
+            echo "Uploading files including new PDFs to Google Drive"
+          fi
+          python .github/workflows/scripts/upload_to_drive.py
       - name: Push to Hugging Face hub
         env:

Dockerfile CHANGED Viewed

@@ -26,6 +26,7 @@ RUN apt-get update && apt-get install -y \
     libavformat-dev \
     libavutil-dev \
     libswresample-dev \
     && rm -rf /var/lib/apt/lists/*
 # Copy requirements first for better caching
@@ -33,6 +34,11 @@ COPY requirements.txt .
 # Install Python dependencies with proper error handling
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
     pip install --no-cache-dir -r requirements.txt
 # Copy application code

     libavformat-dev \
     libavutil-dev \
     libswresample-dev \
+    execstack \
     && rm -rf /var/lib/apt/lists/*
 # Copy requirements first for better caching
 # Install Python dependencies with proper error handling
 RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    # Install ONNX Runtime CPU version specifically
+    pip install --no-cache-dir onnxruntime==1.16.3 && \
+    # Clear the executable-stack flag on the ONNX Runtime libraries.
+    # The parentheses confine "|| true" to this best-effort step; without
+    # them it would also swallow a failure of either pip install above.
+    (find /usr/local/lib/python*/site-packages/onnxruntime -name "*.so" -exec execstack -c {} \; 2>/dev/null || true) && \
+    # Install other requirements
+    # NOTE(review): if requirements.txt resolves to a different onnxruntime
+    # version, the libraries are reinstalled after the fix above - confirm.
     pip install --no-cache-dir -r requirements.txt
 # Copy application code

model_preloader.py CHANGED Viewed

@@ -397,55 +397,85 @@ class ModelPreloader:
         except Exception as e:
             logger.warning(f"Error saving cache for {model_key}: {e}")
-    def load_pyannote_pipeline(self, task_id: str) -> Optional[Pipeline]:
         """Load pyannote speaker diarization pipeline with container-safe settings."""
         try:
             console.print(f"[yellow]Loading pyannote.audio pipeline...[/yellow]")
             # Check for HuggingFace token
-            hf_token = os.getenv('HUGGINGFACE_TOKEN')
             if not hf_token:
                 console.print("[red]Warning: HUGGINGFACE_TOKEN not found. Some models may not be accessible.[/red]")
-            # Container-safe pipeline loading with error suppression
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", category=UserWarning)
-                warnings.filterwarnings("ignore", message=".*executable stack.*")
                 pipeline = Pipeline.from_pretrained(
                     "pyannote/speaker-diarization-3.1",
                     use_auth_token=hf_token,
                     cache_dir=str(self.cache_dir / "pyannote")
                 )
-                # Force CPU execution
                 if hasattr(pipeline, '_models'):
                     for model_name, model in pipeline._models.items():
                         if hasattr(model, 'to'):
                             model.to('cpu')
-            console.print(f"[green]SUCCESS: pyannote.audio pipeline loaded successfully on {self.device}[/green]")
-            return pipeline
         except Exception as e:
-            # Check if it's the expected ONNX Runtime warning
-            if "executable stack" in str(e).lower():
-                console.print("[yellow]ONNX Runtime executable stack warning (expected in containers) - continuing...[/yellow]")
-                # Try alternative loading method
                 try:
-                    import warnings
-                    with warnings.catch_warnings():
-                        warnings.simplefilter("ignore")
-                        pipeline = Pipeline.from_pretrained(
-                            "pyannote/speaker-diarization-3.1",
-                            use_auth_token=hf_token,
-                            cache_dir=str(self.cache_dir / "pyannote")
-                        )
-                        return pipeline
-                except:
-                    pass
-            console.print(f"[red]ERROR: Failed to load pyannote.audio pipeline: {e}[/red]")
             logger.error(f"Pyannote loading failed: {e}")
             return None

         except Exception as e:
             logger.warning(f"Error saving cache for {model_key}: {e}")
+    def load_pyannote_pipeline(self) -> Optional[Pipeline]:
         """Load pyannote speaker diarization pipeline with container-safe settings."""
         try:
             console.print(f"[yellow]Loading pyannote.audio pipeline...[/yellow]")
+            # Fix ONNX Runtime libraries first
+            try:
+                import subprocess
+                subprocess.run([
+                    'find', '/usr/local/lib/python*/site-packages/onnxruntime',
+                    '-name', '*.so', '-exec', 'execstack', '-c', '{}', ';'
+                ], capture_output=True, timeout=10, stderr=subprocess.DEVNULL)
+            except:
+                pass
             # Check for HuggingFace token
+            hf_token = os.getenv('HUGGINGFACE_TOKEN') or os.getenv('HF_TOKEN')
             if not hf_token:
                 console.print("[red]Warning: HUGGINGFACE_TOKEN not found. Some models may not be accessible.[/red]")
+            # Suppress all warnings during pipeline loading
+            import warnings
+            import logging
+            # Temporarily disable all warnings and logging
+            old_warning_filters = warnings.filters[:]
+            warnings.filterwarnings("ignore")
+            # Disable ONNX Runtime logging
+            os.environ['ORT_LOGGING_LEVEL'] = '3'  # ERROR only
+            # Disable other verbose logging
+            logging.getLogger('onnxruntime').setLevel(logging.ERROR)
+            logging.getLogger('transformers').setLevel(logging.ERROR)
+            try:
                 pipeline = Pipeline.from_pretrained(
                     "pyannote/speaker-diarization-3.1",
                     use_auth_token=hf_token,
                     cache_dir=str(self.cache_dir / "pyannote")
                 )
+                # Force CPU execution for all models in pipeline
                 if hasattr(pipeline, '_models'):
                     for model_name, model in pipeline._models.items():
                         if hasattr(model, 'to'):
                             model.to('cpu')
+                console.print(f"[green]SUCCESS: pyannote.audio pipeline loaded successfully on CPU[/green]")
+                return pipeline
+            finally:
+                # Restore warning filters
+                warnings.filters[:] = old_warning_filters
         except Exception as e:
+            error_msg = str(e).lower()
+            if "executable stack" in error_msg or "onnxruntime" in error_msg:
+                console.print("[yellow]ONNX Runtime container warning (attempting workaround)...[/yellow]")
+                # Try alternative approach - load without ONNX-dependent components
                 try:
+                    # Try loading with CPU-only execution providers
+                    import onnxruntime as ort
+                    ort.set_default_logger_severity(4)  # FATAL only
+                    pipeline = Pipeline.from_pretrained(
+                        "pyannote/speaker-diarization-3.1",
+                        use_auth_token=hf_token,
+                        cache_dir=str(self.cache_dir / "pyannote")
+                    )
+                    console.print(f"[green]SUCCESS: pyannote.audio loaded with workaround[/green]")
+                    return pipeline
+                except Exception as e2:
+                    console.print(f"[red]ERROR: All pyannote loading methods failed: {e2}[/red]")
+            else:
+                console.print(f"[red]ERROR: Failed to load pyannote.audio pipeline: {e}[/red]")
             logger.error(f"Pyannote loading failed: {e}")
             return None

startup.py CHANGED Viewed

@@ -70,15 +70,33 @@ def preload_models():
         import model_preloader
         logger.info('✅ Model preloader module found')
-        # Set environment variables to handle onnxruntime issues
         env = os.environ.copy()
         env.update({
             'ORT_DYLIB_DEFAULT_OPTIONS': 'DisableExecutablePageAllocator=1',
             'ONNXRUNTIME_EXECUTION_PROVIDERS': 'CPUExecutionProvider',
             'TF_ENABLE_ONEDNN_OPTS': '0',
-            'OMP_NUM_THREADS': '1'
         })
         # Try to run the preloader
         result = subprocess.run(
             ['python', 'model_preloader.py'],
@@ -96,18 +114,22 @@ def preload_models():
         else:
             logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
             if result.stderr:
-                # Check if it's the onnxruntime issue
-                if 'cannot enable executable stack' in result.stderr:
-                    logger.warning('⚠️ ONNX Runtime executable stack issue detected - this is expected in containers')
-                else:
-                    logger.warning(f'Preloader stderr: {result.stderr[:500]}...')
             return False
     except subprocess.TimeoutExpired:
         logger.warning('⚠️ Model preloading timed out, continuing...')
         return False
     except Exception as e:
-        logger.warning(f'⚠️ Model preloading failed: {e}')
         return False
 def start_web_app():

         import model_preloader
         logger.info('✅ Model preloader module found')
+        # Set comprehensive environment variables for ONNX Runtime
         env = os.environ.copy()
         env.update({
             'ORT_DYLIB_DEFAULT_OPTIONS': 'DisableExecutablePageAllocator=1',
             'ONNXRUNTIME_EXECUTION_PROVIDERS': 'CPUExecutionProvider',
+            'ORT_DISABLE_TLS_ARENA': '1',
             'TF_ENABLE_ONEDNN_OPTS': '0',
+            'OMP_NUM_THREADS': '1',
+            'MKL_NUM_THREADS': '1',
+            'NUMBA_NUM_THREADS': '1',
+            'TOKENIZERS_PARALLELISM': 'false',
+            'MALLOC_ARENA_MAX': '2',
+            # Additional ONNX Runtime fixes
+            'ONNXRUNTIME_LOG_SEVERITY_LEVEL': '3',
+            'ORT_LOGGING_LEVEL': 'WARNING'
         })
+        # Try to fix ONNX Runtime libraries before running preloader
+        try:
+            import subprocess
+            subprocess.run([
+                'find', '/usr/local/lib/python*/site-packages/onnxruntime',
+                '-name', '*.so', '-exec', 'execstack', '-c', '{}', ';'
+            ], capture_output=True, timeout=30)
+        except:
+            pass  # Continue if execstack fix fails
         # Try to run the preloader
         result = subprocess.run(
             ['python', 'model_preloader.py'],
         else:
             logger.warning(f'⚠️ Model preloading failed with return code {result.returncode}')
             if result.stderr:
+                # Filter out expected ONNX warnings
+                stderr_lines = result.stderr.split('\n')
+                important_errors = [line for line in stderr_lines
+                                  if 'executable stack' not in line.lower()
+                                  and 'onnxruntime' not in line.lower()
+                                  and line.strip()]
+                if important_errors:
+                    logger.warning(f'Important errors: {important_errors[:3]}')
             return False
     except subprocess.TimeoutExpired:
         logger.warning('⚠️ Model preloading timed out, continuing...')
         return False
     except Exception as e:
+        if 'executable stack' not in str(e).lower():
+            logger.warning(f'⚠️ Model preloading failed: {e}')
         return False
 def start_web_app():