Spaces:
Build error
Build error
Upload 9 files
Browse files- Dockerfile +128 -0
- app.py +725 -0
- chunker_pipeline.py +1582 -0
- config.yaml +324 -0
- custom_recursive_chunker.py +366 -0
- deployment_instructions.md +154 -0
- guide_deploiement_hf.md +574 -0
- requirements.txt +126 -0
- schemas.py +234 -0
Dockerfile
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dockerfile pour Smart Chunker API v4.0
|
| 2 |
+
# Compatible HuggingFace Spaces + Pipeline complet
|
| 3 |
+
# Version finale corrigée
|
| 4 |
+
|
| 5 |
+
# ===================================
|
| 6 |
+
# IMAGE DE BASE OPTIMISÉE
|
| 7 |
+
# ===================================
|
| 8 |
+
|
| 9 |
+
FROM python:3.10-slim
|
| 10 |
+
|
| 11 |
+
# ===================================
|
| 12 |
+
# MÉTADONNÉES
|
| 13 |
+
# ===================================
|
| 14 |
+
|
| 15 |
+
LABEL maintainer="Smart Chunker Pipeline v4.0"
|
| 16 |
+
LABEL description="API de chunking sémantique intelligent récursif"
|
| 17 |
+
LABEL version="4.0.0"
|
| 18 |
+
|
| 19 |
+
# ===================================
|
| 20 |
+
# VARIABLES D'ENVIRONNEMENT
|
| 21 |
+
# ===================================
|
| 22 |
+
|
| 23 |
+
# Configuration Python
|
| 24 |
+
ENV PYTHONUNBUFFERED=1
|
| 25 |
+
ENV PYTHONDONTWRITEBYTECODE=1
|
| 26 |
+
ENV PYTHONIOENCODING=utf-8
|
| 27 |
+
|
| 28 |
+
# Configuration HuggingFace pour Spaces
|
| 29 |
+
ENV HF_HOME=/tmp/huggingface
|
| 30 |
+
ENV TRANSFORMERS_CACHE=/tmp/transformers
|
| 31 |
+
ENV HF_HUB_CACHE=/tmp/hub
|
| 32 |
+
ENV TOKENIZERS_PARALLELISM=false
|
| 33 |
+
ENV HF_HUB_DISABLE_PROGRESS_BARS=1
|
| 34 |
+
ENV TRANSFORMERS_VERBOSITY=error
|
| 35 |
+
|
| 36 |
+
# Configuration FastAPI/Uvicorn
|
| 37 |
+
ENV PORT=7860
|
| 38 |
+
ENV HOST=0.0.0.0
|
| 39 |
+
ENV WORKERS=1
|
| 40 |
+
|
| 41 |
+
# Optimisations performance
|
| 42 |
+
ENV OMP_NUM_THREADS=1
|
| 43 |
+
ENV OPENBLAS_NUM_THREADS=1
|
| 44 |
+
ENV MKL_NUM_THREADS=1
|
| 45 |
+
|
| 46 |
+
# ===================================
|
| 47 |
+
# INSTALLATION DÉPENDANCES SYSTÈME
|
| 48 |
+
# ===================================
|
| 49 |
+
|
| 50 |
+
RUN apt-get update && apt-get install -y \
|
| 51 |
+
build-essential \
|
| 52 |
+
git \
|
| 53 |
+
curl \
|
| 54 |
+
&& rm -rf /var/lib/apt/lists/* \
|
| 55 |
+
&& apt-get clean
|
| 56 |
+
|
| 57 |
+
# ===================================
|
| 58 |
+
# CRÉATION RÉPERTOIRE TRAVAIL
|
| 59 |
+
# ===================================
|
| 60 |
+
|
| 61 |
+
WORKDIR /app
|
| 62 |
+
|
| 63 |
+
# ===================================
|
| 64 |
+
# CRÉATION DOSSIERS CACHE
|
| 65 |
+
# ===================================
|
| 66 |
+
|
| 67 |
+
RUN mkdir -p /tmp/huggingface \
|
| 68 |
+
&& mkdir -p /tmp/transformers \
|
| 69 |
+
&& mkdir -p /tmp/hub \
|
| 70 |
+
&& mkdir -p /tmp/llm \
|
| 71 |
+
&& mkdir -p /tmp/embeddings \
|
| 72 |
+
&& mkdir -p /tmp/logs \
|
| 73 |
+
&& chmod -R 755 /tmp
|
| 74 |
+
|
| 75 |
+
# ===================================
|
| 76 |
+
# COPIE FICHIERS CONFIGURATION
|
| 77 |
+
# ===================================
|
| 78 |
+
|
| 79 |
+
# Copie requirements en premier pour cache Docker
|
| 80 |
+
COPY requirements.txt .
|
| 81 |
+
|
| 82 |
+
# ===================================
|
| 83 |
+
# INSTALLATION DÉPENDANCES PYTHON
|
| 84 |
+
# ===================================
|
| 85 |
+
|
| 86 |
+
# Mise à jour pip
|
| 87 |
+
RUN pip install --no-cache-dir --upgrade pip
|
| 88 |
+
|
| 89 |
+
# Installation des dépendances avec cache nettoyé
|
| 90 |
+
RUN rm -rf /root/.cache/pip \
|
| 91 |
+
&& pip install --no-cache-dir -r requirements.txt
|
| 92 |
+
|
| 93 |
+
# ===================================
|
| 94 |
+
# COPIE CODE APPLICATION
|
| 95 |
+
# ===================================
|
| 96 |
+
|
| 97 |
+
# Copie tous les fichiers Python
|
| 98 |
+
COPY *.py .
|
| 99 |
+
COPY config.yaml .
|
| 100 |
+
|
| 101 |
+
# ===================================
|
| 102 |
+
# CONFIGURATION PERMISSIONS
|
| 103 |
+
# ===================================
|
| 104 |
+
|
| 105 |
+
# S'assurer que les dossiers sont accessibles
|
| 106 |
+
RUN chmod -R 755 /app \
|
| 107 |
+
&& chmod -R 777 /tmp
|
| 108 |
+
|
| 109 |
+
# ===================================
|
| 110 |
+
# VÉRIFICATION SANTÉ
|
| 111 |
+
# ===================================
|
| 112 |
+
|
| 113 |
+
# Healthcheck pour vérifier que l'API répond
|
| 114 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 115 |
+
CMD curl -f http://localhost:${PORT}/health || exit 1
|
| 116 |
+
|
| 117 |
+
# ===================================
|
| 118 |
+
# EXPOSITION PORT
|
| 119 |
+
# ===================================
|
| 120 |
+
|
| 121 |
+
EXPOSE ${PORT}
|
| 122 |
+
|
| 123 |
+
# ===================================
|
| 124 |
+
# COMMANDE DE DÉMARRAGE
|
| 125 |
+
# ===================================
|
| 126 |
+
|
| 127 |
+
# Commande par défaut pour démarrer l'application
|
| 128 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
app.py
ADDED
|
@@ -0,0 +1,725 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py v4.0 FINAL - FastAPI pour Chunking Sémantique Intelligent
|
| 3 |
+
|
| 4 |
+
CORRECTIONS ET AMÉLIORATIONS:
|
| 5 |
+
✅ Import SmartChunkerPipeline (correct)
|
| 6 |
+
✅ Méthodes synchronisées avec chunker_pipeline.py
|
| 7 |
+
✅ Gestion d'erreurs robuste
|
| 8 |
+
✅ Endpoints optimisés pour n8n
|
| 9 |
+
✅ Variables d'environnement sécurisées
|
| 10 |
+
✅ Monitoring et health checks complets
|
| 11 |
+
✅ Configuration HF Space gratuit optimisée
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import tempfile
|
| 16 |
+
|
| 17 |
+
import logging
|
| 18 |
+
import time
|
| 19 |
+
import asyncio
|
| 20 |
+
import gc
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from fastapi import FastAPI, HTTPException, Request
|
| 23 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 24 |
+
from fastapi.responses import JSONResponse
|
| 25 |
+
from pydantic import BaseModel, Field
|
| 26 |
+
from typing import List, Dict, Any, Optional
|
| 27 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
import os
|
| 31 |
+
|
| 32 |
+
#os.environ["HF_HOME"] = "/tmp/cache/huggingface"
|
| 33 |
+
#os.environ["TRANSFORMERS_CACHE"] = "/tmp/cache/transformers"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
os.environ["HF_HOME"] = "/tmp/hf"
|
| 37 |
+
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Configuration logging optimisée
|
| 43 |
+
logging.basicConfig(
|
| 44 |
+
level=logging.INFO,
|
| 45 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 46 |
+
handlers=[
|
| 47 |
+
logging.StreamHandler(),
|
| 48 |
+
logging.FileHandler("/app/logs/app.log", mode="a") if os.path.exists("/app/logs") else logging.StreamHandler()
|
| 49 |
+
]
|
| 50 |
+
)
|
| 51 |
+
logger = logging.getLogger(__name__)
|
| 52 |
+
|
| 53 |
+
# ✅ IMPORTS PRINCIPAUX - Vérification de compatibilité
|
| 54 |
+
try:
|
| 55 |
+
from chunker_pipeline import SmartChunkerPipeline
|
| 56 |
+
from schemas import ChunkRequest, ChunkResponse, ChunkMetadata
|
| 57 |
+
logger.info("✅ Modules chunking v4.0 importés avec succès")
|
| 58 |
+
except ImportError as e:
|
| 59 |
+
logger.error(f"❌ ERREUR CRITIQUE - Import modules chunking: {e}")
|
| 60 |
+
logger.error("Vérifiez que les fichiers chunker_pipeline.py et schemas.py existent")
|
| 61 |
+
raise
|
| 62 |
+
|
| 63 |
+
# ✅ CONFIGURATION ENVIRONNEMENT HF SPACE SÉCURISÉE
|
| 64 |
+
def setup_environment():
|
| 65 |
+
"""Configuration optimisée pour Hugging Face Space gratuit"""
|
| 66 |
+
|
| 67 |
+
# ✅ Compatible Hugging Face Space (car /tmp est accessible en écriture)
|
| 68 |
+
|
| 69 |
+
cache_base = os.path.join(tempfile.gettempdir(), "cache")
|
| 70 |
+
os.environ["HF_HOME"] = os.path.join(cache_base, "huggingface")
|
| 71 |
+
os.environ["TRANSFORMERS_CACHE"] = os.path.join(cache_base, "transformers")
|
| 72 |
+
os.environ["HF_HUB_CACHE"] = os.path.join(cache_base, "hub")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# Optimisations performance
|
| 76 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 77 |
+
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
| 78 |
+
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
| 79 |
+
os.environ["PYTHONUNBUFFERED"] = "1"
|
| 80 |
+
|
| 81 |
+
# Création dossiers cache sécurisés
|
| 82 |
+
cache_dirs = [
|
| 83 |
+
os.environ["HF_HOME"],
|
| 84 |
+
os.environ["TRANSFORMERS_CACHE"],
|
| 85 |
+
os.environ["HF_HUB_CACHE"],
|
| 86 |
+
os.path.join(cache_base, "llm"),
|
| 87 |
+
os.path.join(cache_base, "embeddings"),
|
| 88 |
+
os.path.join(cache_base, "logs")
|
| 89 |
+
]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
for cache_dir in cache_dirs:
|
| 94 |
+
try:
|
| 95 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 96 |
+
os.chmod(cache_dir, 0o755)
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logger.warning(f"⚠️ Impossible de créer {cache_dir}: {e}")
|
| 99 |
+
|
| 100 |
+
logger.info("✅ Environnement HF Space configuré")
|
| 101 |
+
|
| 102 |
+
# Configuration environnement
|
| 103 |
+
setup_environment()
|
| 104 |
+
|
| 105 |
+
# ✅ INITIALISATION FASTAPI OPTIMISÉE
|
| 106 |
+
app = FastAPI(
|
| 107 |
+
title="🧠 Chunking Sémantique Intelligent API",
|
| 108 |
+
description="""
|
| 109 |
+
**API de découpage récursif hiérarchique avec parentalité**
|
| 110 |
+
|
| 111 |
+
🚀 **Fonctionnalités:**
|
| 112 |
+
- Chunking sémantique avec Chonkie + LlamaIndex
|
| 113 |
+
- Relations bidirectionnelles parent/enfant
|
| 114 |
+
- Export Obsidian format [[Titre]], id
|
| 115 |
+
- Base connaissance pour agents IA spécialisés
|
| 116 |
+
- 100% gratuit sur HuggingFace Space
|
| 117 |
+
|
| 118 |
+
🔧 **Optimisé pour n8n et automation**
|
| 119 |
+
""",
|
| 120 |
+
version="4.0.0",
|
| 121 |
+
docs_url="/docs",
|
| 122 |
+
redoc_url="/redoc",
|
| 123 |
+
openapi_tags=[
|
| 124 |
+
{"name": "chunking", "description": "Endpoints de chunking principal"},
|
| 125 |
+
{"name": "monitoring", "description": "Santé et configuration"},
|
| 126 |
+
{"name": "test", "description": "Tests et validation"}
|
| 127 |
+
]
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# ✅ CORS ÉTENDU POUR N8N ET INTÉGRATIONS
|
| 131 |
+
app.add_middleware(
|
| 132 |
+
CORSMiddleware,
|
| 133 |
+
allow_origins=["*"], # Nécessaire pour n8n
|
| 134 |
+
allow_credentials=True,
|
| 135 |
+
allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
|
| 136 |
+
allow_headers=["*"],
|
| 137 |
+
expose_headers=["*"]
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
# ✅ VARIABLES GLOBALES
|
| 141 |
+
pipeline = None
|
| 142 |
+
executor = ThreadPoolExecutor(max_workers=1) # HF Space gratuit = 1 worker max
|
| 143 |
+
startup_time = time.time()
|
| 144 |
+
request_count = 0
|
| 145 |
+
|
| 146 |
+
# ✅ MIDDLEWARE MONITORING ET SÉCURITÉ
|
| 147 |
+
@app.middleware("http")
|
| 148 |
+
async def monitoring_middleware(request: Request, call_next):
|
| 149 |
+
"""Middleware pour monitoring et gestion erreurs globales"""
|
| 150 |
+
global request_count
|
| 151 |
+
start_time = time.time()
|
| 152 |
+
request_count += 1
|
| 153 |
+
|
| 154 |
+
# Headers sécurité
|
| 155 |
+
response = None
|
| 156 |
+
try:
|
| 157 |
+
response = await call_next(request)
|
| 158 |
+
response.headers["X-API-Version"] = "4.0.0"
|
| 159 |
+
response.headers["X-Powered-By"] = "Chunking-Semantic-AI"
|
| 160 |
+
|
| 161 |
+
# Log performance
|
| 162 |
+
process_time = time.time() - start_time
|
| 163 |
+
if process_time > 5.0: # Log requêtes lentes
|
| 164 |
+
logger.warning(f"⚠️ Requête lente: {request.url.path} - {process_time:.2f}s")
|
| 165 |
+
|
| 166 |
+
return response
|
| 167 |
+
|
| 168 |
+
except Exception as e:
|
| 169 |
+
logger.error(f"❌ Erreur middleware {request.url.path}: {str(e)}")
|
| 170 |
+
|
| 171 |
+
# Réponse d'erreur structurée
|
| 172 |
+
return JSONResponse(
|
| 173 |
+
status_code=500,
|
| 174 |
+
content={
|
| 175 |
+
"error": "Erreur interne du serveur",
|
| 176 |
+
"detail": str(e),
|
| 177 |
+
"path": str(request.url.path),
|
| 178 |
+
"timestamp": time.time(),
|
| 179 |
+
"request_id": request_count,
|
| 180 |
+
"version": "4.0.0"
|
| 181 |
+
}
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# ✅ ÉVÉNEMENTS LIFECYCLE
|
| 185 |
+
@app.on_event("startup")
|
| 186 |
+
async def startup_event():
|
| 187 |
+
"""Initialisation complète au démarrage"""
|
| 188 |
+
global pipeline
|
| 189 |
+
|
| 190 |
+
try:
|
| 191 |
+
logger.info("🚀 === DÉMARRAGE API CHUNKING SÉMANTIQUE v4.0 ===")
|
| 192 |
+
|
| 193 |
+
# Vérification espace disque
|
| 194 |
+
import shutil
|
| 195 |
+
total, used, free = shutil.disk_usage("/app")
|
| 196 |
+
free_gb = free / (1024**3)
|
| 197 |
+
logger.info(f"💾 Espace libre: {free_gb:.1f}GB")
|
| 198 |
+
|
| 199 |
+
if free_gb < 1.0:
|
| 200 |
+
logger.warning("⚠️ Espace disque faible (<1GB)")
|
| 201 |
+
|
| 202 |
+
# Initialisation pipeline principal
|
| 203 |
+
logger.info("🔧 Initialisation SmartChunkerPipeline...")
|
| 204 |
+
pipeline = SmartChunkerPipeline()
|
| 205 |
+
await pipeline.initialize()
|
| 206 |
+
|
| 207 |
+
# Vérification santé
|
| 208 |
+
health = await pipeline.health_check_v4()
|
| 209 |
+
logger.info(f"🏥 Status santé: {health['status']}")
|
| 210 |
+
|
| 211 |
+
if health['status'] != 'healthy':
|
| 212 |
+
logger.warning(f"⚠️ Pipeline en mode dégradé: {health['status']}")
|
| 213 |
+
|
| 214 |
+
# Configuration système
|
| 215 |
+
config_info = await pipeline.get_config_info_v4()
|
| 216 |
+
logger.info(f"🧠 LLM: {config_info['models']['llm_model']}")
|
| 217 |
+
logger.info(f"🔤 Embedding: {config_info['models']['embedding_model']}")
|
| 218 |
+
logger.info(f"🦛 Chonkie: {'✅' if config_info['models']['chonkie_available'] else '❌'}")
|
| 219 |
+
|
| 220 |
+
# Test rapide de fonctionnement
|
| 221 |
+
test_request = ChunkRequest(
|
| 222 |
+
text="Test d'initialisation du système de chunking.",
|
| 223 |
+
titre="Test Init",
|
| 224 |
+
source_id="init_test"
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
test_result = await pipeline.process_text(test_request)
|
| 228 |
+
logger.info(f"✅ Test init: {test_result.total_chunks} chunks générés")
|
| 229 |
+
|
| 230 |
+
logger.info("🎉 API Chunking Sémantique v4.0 prête !")
|
| 231 |
+
|
| 232 |
+
except Exception as e:
|
| 233 |
+
logger.error(f"❌ ERREUR CRITIQUE lors du démarrage: {e}")
|
| 234 |
+
logger.error("Le service ne pourra pas fonctionner correctement")
|
| 235 |
+
raise
|
| 236 |
+
|
| 237 |
+
@app.on_event("shutdown")
|
| 238 |
+
async def shutdown_event():
|
| 239 |
+
"""Nettoyage propre à l'arrêt"""
|
| 240 |
+
global pipeline, executor
|
| 241 |
+
|
| 242 |
+
try:
|
| 243 |
+
logger.info("🛑 Arrêt du service en cours...")
|
| 244 |
+
|
| 245 |
+
# Nettoyage pipeline
|
| 246 |
+
if pipeline:
|
| 247 |
+
await pipeline.cleanup()
|
| 248 |
+
logger.info("✅ Pipeline nettoyé")
|
| 249 |
+
|
| 250 |
+
# Nettoyage executor
|
| 251 |
+
if executor:
|
| 252 |
+
executor.shutdown(wait=True, timeout=10)
|
| 253 |
+
logger.info("✅ Executor fermé")
|
| 254 |
+
|
| 255 |
+
# Nettoyage mémoire final
|
| 256 |
+
gc.collect()
|
| 257 |
+
|
| 258 |
+
# Statistiques finales
|
| 259 |
+
uptime = time.time() - startup_time
|
| 260 |
+
logger.info(f"📊 Statistiques finales:")
|
| 261 |
+
logger.info(f" - Temps de fonctionnement: {uptime:.1f}s")
|
| 262 |
+
logger.info(f" - Requêtes traitées: {request_count}")
|
| 263 |
+
logger.info(f" - Moyenne: {request_count/uptime:.2f} req/s")
|
| 264 |
+
|
| 265 |
+
logger.info("✅ Arrêt propre terminé")
|
| 266 |
+
|
| 267 |
+
except Exception as e:
|
| 268 |
+
logger.error(f"⚠️ Erreur lors de l'arrêt: {e}")
|
| 269 |
+
|
| 270 |
+
# ✅ ENDPOINTS PRINCIPAUX
|
| 271 |
+
|
| 272 |
+
@app.get("/", tags=["monitoring"])
|
| 273 |
+
async def root():
|
| 274 |
+
"""Page d'accueil avec informations complètes du service"""
|
| 275 |
+
uptime = time.time() - startup_time
|
| 276 |
+
|
| 277 |
+
return {
|
| 278 |
+
"service": "🧠 Chunking Sémantique Intelligent API",
|
| 279 |
+
"version": "4.0.0",
|
| 280 |
+
"status": "🟢 Opérationnel" if pipeline else "🔴 Non initialisé",
|
| 281 |
+
"uptime_seconds": round(uptime, 1),
|
| 282 |
+
"requests_processed": request_count,
|
| 283 |
+
|
| 284 |
+
"features": [
|
| 285 |
+
"🧩 Chunking sémantique avec Chonkie",
|
| 286 |
+
"🏗️ Hiérarchie récursive intelligente",
|
| 287 |
+
"🔗 Relations bidirectionnelles parent/enfant",
|
| 288 |
+
"📝 Export Obsidian format [[Titre]], id",
|
| 289 |
+
"🤖 Base connaissance pour agents IA spécialisés",
|
| 290 |
+
"💰 100% gratuit sur HuggingFace Space",
|
| 291 |
+
"🔄 Optimisé pour n8n et automation"
|
| 292 |
+
],
|
| 293 |
+
|
| 294 |
+
"endpoints": {
|
| 295 |
+
"chunking": [
|
| 296 |
+
"POST /chunk - Chunking principal",
|
| 297 |
+
"POST /chunk-batch - Traitement par lots"
|
| 298 |
+
],
|
| 299 |
+
"monitoring": [
|
| 300 |
+
"GET /health - Vérification santé détaillée",
|
| 301 |
+
"GET /config - Configuration système",
|
| 302 |
+
"GET /stats - Statistiques d'usage"
|
| 303 |
+
],
|
| 304 |
+
"test": [
|
| 305 |
+
"POST /test - Test de validation",
|
| 306 |
+
"GET /ping - Test connectivité simple"
|
| 307 |
+
]
|
| 308 |
+
},
|
| 309 |
+
|
| 310 |
+
"documentation": {
|
| 311 |
+
"interactive": "/docs",
|
| 312 |
+
"redoc": "/redoc"
|
| 313 |
+
},
|
| 314 |
+
|
| 315 |
+
"support": {
|
| 316 |
+
"n8n_compatible": True,
|
| 317 |
+
"max_text_length": "500,000 caractères",
|
| 318 |
+
"max_batch_size": 3,
|
| 319 |
+
"response_format": "JSON structuré"
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
@app.get("/health", tags=["monitoring"])
|
| 324 |
+
async def health_check():
|
| 325 |
+
"""Vérification santé complète et détaillée"""
|
| 326 |
+
try:
|
| 327 |
+
if pipeline is None:
|
| 328 |
+
return {
|
| 329 |
+
"status": "🔴 error",
|
| 330 |
+
"message": "Pipeline non initialisé",
|
| 331 |
+
"version": "4.0.0",
|
| 332 |
+
"timestamp": time.time(),
|
| 333 |
+
"uptime": time.time() - startup_time,
|
| 334 |
+
"critical": True
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
# Health check pipeline
|
| 338 |
+
health_result = await pipeline.health_check_v4()
|
| 339 |
+
|
| 340 |
+
# Informations mémoire
|
| 341 |
+
memory_info = pipeline.get_memory_usage_v4()
|
| 342 |
+
|
| 343 |
+
# Statistiques système
|
| 344 |
+
import psutil
|
| 345 |
+
try:
|
| 346 |
+
cpu_percent = psutil.cpu_percent(interval=1)
|
| 347 |
+
memory_percent = psutil.virtual_memory().percent
|
| 348 |
+
except:
|
| 349 |
+
cpu_percent = 0
|
| 350 |
+
memory_percent = 0
|
| 351 |
+
|
| 352 |
+
# Status coloré
|
| 353 |
+
status_map = {
|
| 354 |
+
"healthy": "🟢 healthy",
|
| 355 |
+
"degraded": "🟡 degraded",
|
| 356 |
+
"unhealthy": "🔴 unhealthy",
|
| 357 |
+
"error": "🔴 error"
|
| 358 |
+
}
|
| 359 |
+
|
| 360 |
+
return {
|
| 361 |
+
**health_result,
|
| 362 |
+
"status": status_map.get(health_result['status'], health_result['status']),
|
| 363 |
+
"memory_info": memory_info,
|
| 364 |
+
"system_info": {
|
| 365 |
+
"cpu_percent": cpu_percent,
|
| 366 |
+
"memory_percent": memory_percent,
|
| 367 |
+
"uptime": time.time() - startup_time,
|
| 368 |
+
"requests_processed": request_count
|
| 369 |
+
},
|
| 370 |
+
"version": "4.0.0"
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
except Exception as e:
|
| 374 |
+
logger.error(f"❌ Erreur health check: {e}")
|
| 375 |
+
return {
|
| 376 |
+
"status": "🔴 error",
|
| 377 |
+
"message": f"Erreur health check: {str(e)}",
|
| 378 |
+
"version": "4.0.0",
|
| 379 |
+
"timestamp": time.time(),
|
| 380 |
+
"critical": True
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
@app.get("/config", tags=["monitoring"])
|
| 384 |
+
async def get_config():
|
| 385 |
+
"""Configuration système détaillée"""
|
| 386 |
+
try:
|
| 387 |
+
if pipeline is None:
|
| 388 |
+
raise HTTPException(status_code=503, detail="Pipeline non initialisé")
|
| 389 |
+
|
| 390 |
+
config_info = await pipeline.get_config_info_v4()
|
| 391 |
+
|
| 392 |
+
# Ajout informations runtime
|
| 393 |
+
runtime_info = {
|
| 394 |
+
"python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
|
| 395 |
+
"platform": os.name,
|
| 396 |
+
"workers": 1,
|
| 397 |
+
"max_request_size": "500KB",
|
| 398 |
+
"cache_enabled": True,
|
| 399 |
+
"environment": "HuggingFace Space"
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
return {
|
| 403 |
+
**config_info,
|
| 404 |
+
"runtime_info": runtime_info,
|
| 405 |
+
"api_version": "4.0.0",
|
| 406 |
+
"timestamp": time.time()
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
except Exception as e:
|
| 410 |
+
logger.error(f"❌ Erreur récupération config: {e}")
|
| 411 |
+
raise HTTPException(status_code=500, detail=f"Erreur config: {str(e)}")
|
| 412 |
+
|
| 413 |
+
@app.get("/stats", tags=["monitoring"])
|
| 414 |
+
async def get_stats():
|
| 415 |
+
"""Statistiques d'usage détaillées"""
|
| 416 |
+
uptime = time.time() - startup_time
|
| 417 |
+
avg_requests_per_minute = (request_count / uptime) * 60 if uptime > 0 else 0
|
| 418 |
+
|
| 419 |
+
return {
|
| 420 |
+
"service_stats": {
|
| 421 |
+
"uptime_seconds": round(uptime, 1),
|
| 422 |
+
"uptime_formatted": f"{int(uptime//3600)}h {int((uptime%3600)//60)}m {int(uptime%60)}s",
|
| 423 |
+
"total_requests": request_count,
|
| 424 |
+
"avg_requests_per_minute": round(avg_requests_per_minute, 2)
|
| 425 |
+
},
|
| 426 |
+
"system_health": {
|
| 427 |
+
"pipeline_initialized": pipeline is not None,
|
| 428 |
+
"memory_usage": pipeline.get_memory_usage_v4() if pipeline else "N/A"
|
| 429 |
+
},
|
| 430 |
+
"version": "4.0.0",
|
| 431 |
+
"timestamp": time.time()
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
@app.post("/chunk", response_model=ChunkResponse, tags=["chunking"])
|
| 435 |
+
async def chunk_text(request: ChunkRequest):
|
| 436 |
+
"""
|
| 437 |
+
🧠 ENDPOINT PRINCIPAL - Chunking sémantique intelligent
|
| 438 |
+
|
| 439 |
+
**Fonctionnalités:**
|
| 440 |
+
- Chunking sémantique avec Chonkie + LlamaIndex
|
| 441 |
+
- Relations hiérarchiques bidirectionnelles
|
| 442 |
+
- Export Obsidian format [[Titre]], id
|
| 443 |
+
- Base connaissance pour agents IA
|
| 444 |
+
|
| 445 |
+
**Optimisé pour n8n et automation**
|
| 446 |
+
"""
|
| 447 |
+
if pipeline is None:
|
| 448 |
+
raise HTTPException(
|
| 449 |
+
status_code=503,
|
| 450 |
+
detail="❌ Pipeline non initialisé - Redémarrez le service"
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
+
start_time = time.time()
|
| 454 |
+
|
| 455 |
+
try:
|
| 456 |
+
logger.info(f"📝 Début chunking: {request.titre or 'Sans titre'} ({len(request.text)} chars)")
|
| 457 |
+
|
| 458 |
+
# Validation entrées renforcée
|
| 459 |
+
if not request.text or len(request.text.strip()) < 10:
|
| 460 |
+
raise HTTPException(
|
| 461 |
+
status_code=400,
|
| 462 |
+
detail="❌ Le texte doit contenir au moins 10 caractères"
|
| 463 |
+
)
|
| 464 |
+
|
| 465 |
+
# Limite HF Space gratuit
|
| 466 |
+
max_length = 500000
|
| 467 |
+
if len(request.text) > max_length:
|
| 468 |
+
raise HTTPException(
|
| 469 |
+
status_code=400,
|
| 470 |
+
detail=f"❌ Texte trop long ({len(request.text)} chars). Maximum: {max_length:,} caractères"
|
| 471 |
+
)
|
| 472 |
+
|
| 473 |
+
# Traitement principal
|
| 474 |
+
result = await pipeline.process_text(request)
|
| 475 |
+
|
| 476 |
+
processing_time = time.time() - start_time
|
| 477 |
+
|
| 478 |
+
# Log succès
|
| 479 |
+
logger.info(
|
| 480 |
+
f"✅ Chunking terminé: {result.total_chunks} chunks, "
|
| 481 |
+
f"{result.total_tokens} tokens en {processing_time:.2f}s"
|
| 482 |
+
)
|
| 483 |
+
|
| 484 |
+
return result
|
| 485 |
+
|
| 486 |
+
except HTTPException:
|
| 487 |
+
raise
|
| 488 |
+
except Exception as e:
|
| 489 |
+
logger.error(f"❌ Erreur chunking: {str(e)}")
|
| 490 |
+
|
| 491 |
+
# Nettoyage mémoire d'urgence
|
| 492 |
+
try:
|
| 493 |
+
await pipeline._cleanup_memory_v4()
|
| 494 |
+
gc.collect()
|
| 495 |
+
except:
|
| 496 |
+
pass
|
| 497 |
+
|
| 498 |
+
raise HTTPException(
|
| 499 |
+
status_code=500,
|
| 500 |
+
detail=f"❌ Erreur traitement: {str(e)}"
|
| 501 |
+
)
|
| 502 |
+
|
| 503 |
+
@app.post("/chunk-batch", tags=["chunking"])
|
| 504 |
+
async def chunk_batch(requests: List[ChunkRequest]):
|
| 505 |
+
"""
|
| 506 |
+
📦 Traitement par lots optimisé pour HF Space gratuit
|
| 507 |
+
|
| 508 |
+
**Limites:**
|
| 509 |
+
- Maximum 3 textes par lot
|
| 510 |
+
- Traitement séquentiel pour économiser la mémoire
|
| 511 |
+
"""
|
| 512 |
+
|
| 513 |
+
# Validation limite batch pour Space gratuit
|
| 514 |
+
max_batch_size = 3
|
| 515 |
+
if len(requests) > max_batch_size:
|
| 516 |
+
raise HTTPException(
|
| 517 |
+
status_code=400,
|
| 518 |
+
detail=f"❌ Maximum {max_batch_size} textes par lot sur HF Space gratuit"
|
| 519 |
+
)
|
| 520 |
+
|
| 521 |
+
if pipeline is None:
|
| 522 |
+
raise HTTPException(status_code=503, detail="❌ Pipeline non initialisé")
|
| 523 |
+
|
| 524 |
+
start_time = time.time()
|
| 525 |
+
results = []
|
| 526 |
+
|
| 527 |
+
try:
|
| 528 |
+
logger.info(f"📦 Début batch: {len(requests)} textes")
|
| 529 |
+
|
| 530 |
+
for idx, request in enumerate(requests):
|
| 531 |
+
try:
|
| 532 |
+
logger.info(f" 📝 Traitement {idx+1}/{len(requests)}: {request.titre or 'Sans titre'}")
|
| 533 |
+
|
| 534 |
+
result = await pipeline.process_text(request)
|
| 535 |
+
results.append({
|
| 536 |
+
"success": True,
|
| 537 |
+
"index": idx,
|
| 538 |
+
"source_id": request.source_id,
|
| 539 |
+
"result": result
|
| 540 |
+
})
|
| 541 |
+
|
| 542 |
+
# Nettoyage entre chaque traitement
|
| 543 |
+
if idx < len(requests) - 1: # Pas pour le dernier
|
| 544 |
+
await pipeline._cleanup_memory_v4()
|
| 545 |
+
|
| 546 |
+
except Exception as e:
|
| 547 |
+
logger.error(f"❌ Erreur batch item {idx}: {e}")
|
| 548 |
+
results.append({
|
| 549 |
+
"success": False,
|
| 550 |
+
"index": idx,
|
| 551 |
+
"source_id": request.source_id or f"item_{idx}",
|
| 552 |
+
"error": str(e)
|
| 553 |
+
})
|
| 554 |
+
|
| 555 |
+
total_time = time.time() - start_time
|
| 556 |
+
successful_results = [r for r in results if r["success"]]
|
| 557 |
+
|
| 558 |
+
# Nettoyage final
|
| 559 |
+
try:
|
| 560 |
+
await pipeline._cleanup_memory_v4()
|
| 561 |
+
except:
|
| 562 |
+
pass
|
| 563 |
+
|
| 564 |
+
logger.info(
|
| 565 |
+
f"✅ Batch terminé: {len(successful_results)}/{len(requests)} succès "
|
| 566 |
+
f"en {total_time:.2f}s"
|
| 567 |
+
)
|
| 568 |
+
|
| 569 |
+
return {
|
| 570 |
+
"results": results,
|
| 571 |
+
"summary": {
|
| 572 |
+
"total_processed": len(requests),
|
| 573 |
+
"successful": len(successful_results),
|
| 574 |
+
"failed": len(requests) - len(successful_results),
|
| 575 |
+
"success_rate": f"{(len(successful_results)/len(requests)*100):.1f}%",
|
| 576 |
+
"total_processing_time": round(total_time, 2),
|
| 577 |
+
"avg_time_per_item": round(total_time / len(requests), 2)
|
| 578 |
+
},
|
| 579 |
+
"version": "4.0.0",
|
| 580 |
+
"timestamp": time.time()
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
except Exception as e:
|
| 584 |
+
logger.error(f"❌ Erreur batch global: {e}")
|
| 585 |
+
gc.collect()
|
| 586 |
+
raise HTTPException(
|
| 587 |
+
status_code=500,
|
| 588 |
+
detail=f"❌ Erreur traitement batch: {str(e)}"
|
| 589 |
+
)
|
| 590 |
+
|
| 591 |
+
@app.post("/test", tags=["test"])
async def test_chunking():
    """🧪 Test de validation du déploiement.

    Runs a small end-to-end chunking request through the live pipeline and
    reports which features passed. Fix vs. original: `features_validated`
    now reflects all six entries of `checks` (the `processing_time_ok`
    check was counted in `success_rate` but never reported).

    Raises:
        HTTPException 503: pipeline not initialized.
        HTTPException 500: the validation run itself failed.
    """
    if pipeline is None:
        raise HTTPException(status_code=503, detail="❌ Pipeline non initialisé")

    try:
        test_request = ChunkRequest(
            text="""
            Ceci est un test complet de chunking sémantique intelligent v4.0.

            Le système utilise Chonkie pour le découpage sémantique avancé.
            Il génère des relations hiérarchiques bidirectionnelles entre les chunks.

            L'export Obsidian utilise le format [[Titre]], id pour les liens.
            Les agents IA reçoivent une base de connaissance parfaitement structurée.

            Ce test valide toutes les fonctionnalités principales du système.
            """,
            titre="Test Validation v4.0",
            source_id="validation_test_v4",
            include_metadata=True,
            export_obsidian=True,
            export_agents=True
        )

        start_time = time.time()
        result = await pipeline.process_text(test_request)
        test_time = time.time() - start_time

        # Detailed feature checks; each boolean feeds the success rate below.
        checks = {
            "chunking_functional": result.total_chunks > 0,
            "metadata_extracted": len(result.chunks[0].metadata.keywords) > 0 if result.chunks else False,
            "hierarchy_built": len(result.hierarchy) > 0,
            "obsidian_export": result.obsidian_export is not None,
            "agent_knowledge": result.agent_knowledge is not None,
            "processing_time_ok": test_time < 30  # under 30 seconds
        }

        success_rate = sum(checks.values()) / len(checks) * 100

        return {
            "test_status": "✅ SUCCESS" if success_rate == 100 else "⚠️ PARTIAL",
            "success_rate": f"{success_rate:.1f}%",
            "results": {
                "chunks_generated": result.total_chunks,
                "tokens_processed": result.total_tokens,
                "processing_time": round(test_time, 2),
                "hierarchy_levels": len(result.hierarchy)
            },
            "checks": checks,
            # One human-readable line per entry in `checks`, in the same order.
            "features_validated": [
                "✅ Chunking sémantique Chonkie" if checks["chunking_functional"] else "❌ Chunking failed",
                "✅ Extraction métadonnées" if checks["metadata_extracted"] else "❌ Metadata failed",
                "✅ Relations hiérarchiques" if checks["hierarchy_built"] else "❌ Hierarchy failed",
                "✅ Export Obsidian" if checks["obsidian_export"] else "❌ Obsidian failed",
                "✅ Base agents IA" if checks["agent_knowledge"] else "❌ Agents failed",
                "✅ Temps de traitement" if checks["processing_time_ok"] else "❌ Timing failed"
            ],
            "version": "4.0.0",
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"❌ Test validation échoué: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"❌ Test échoué: {str(e)}"
        )
|
| 660 |
+
|
| 661 |
+
@app.get("/ping", tags=["test"])
async def ping():
    """🏓 Test de connectivité simple.

    Lightweight liveness probe: never touches the pipeline beyond a
    truthiness check, so it answers even while models are loading.
    """
    status_label = "🟢 Opérationnel" if pipeline else "🔴 Non initialisé"
    payload = {
        "ping": "pong",
        "timestamp": time.time(),
        "version": "4.0.0",
        "status": status_label,
    }
    return payload
|
| 670 |
+
|
| 671 |
+
# ✅ GESTION D'ERREURS PERSONNALISÉE
|
| 672 |
+
|
| 673 |
+
@app.exception_handler(404)
async def not_found_handler(request: Request, exc):
    """Custom 404 handler: points the caller at the endpoints that do exist."""
    # Static catalogue of the API surface, grouped by purpose.
    endpoint_map = {
        "chunking": ["/chunk", "/chunk-batch"],
        "monitoring": ["/health", "/config", "/stats"],
        "test": ["/test", "/ping"],
        "docs": ["/docs", "/redoc"],
    }
    body = {
        "error": "❌ Endpoint non trouvé",
        "message": f"L'endpoint {request.url.path} n'existe pas",
        "available_endpoints": endpoint_map,
        "suggestion": "Consultez /docs pour la documentation complète",
        "version": "4.0.0",
    }
    return JSONResponse(status_code=404, content=body)
|
| 691 |
+
|
| 692 |
+
@app.exception_handler(422)
async def validation_exception_handler(request: Request, exc):
    """Custom handler for Pydantic validation errors (HTTP 422)."""
    body = {
        "error": "❌ Erreur de validation",
        "message": "Les données envoyées ne respectent pas le format attendu",
        # str(exc) keeps the raw Pydantic report for debugging purposes.
        "detail": str(exc),
        "hint": "Vérifiez la structure de votre requête JSON",
        "documentation": "/docs",
        "version": "4.0.0",
    }
    return JSONResponse(status_code=422, content=body)
|
| 706 |
+
|
| 707 |
+
# ✅ POINT D'ENTRÉE PRINCIPAL
|
| 708 |
+
if __name__ == "__main__":
    import uvicorn

    logger.info("🚀 Démarrage direct du serveur...")

    # Server settings tuned for the free Hugging Face Space tier:
    # one worker, capped concurrency, access log disabled to save resources.
    server_options = {
        "host": "0.0.0.0",
        "port": 7860,                   # standard HF Space port
        "reload": False,                # production mode
        "access_log": False,
        "log_level": "info",
        "workers": 1,                   # free tier supports a single worker
        "timeout_keep_alive": 30,
        "limit_concurrency": 10,        # cap simultaneous connections
        "timeout_graceful_shutdown": 30,
    }
    uvicorn.run("app:app", **server_options)
|
chunker_pipeline.py
ADDED
|
@@ -0,0 +1,1582 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smart Chunker Pipeline v4.0 - VERSION FINALE FUSIONNÉE
|
| 3 |
+
Combine les corrections GPT + toutes les fonctionnalités avancées originales
|
| 4 |
+
Compatible LlamaIndex v0.12 + HuggingFace + CustomRecursiveChunker
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import logging
|
| 10 |
+
import yaml
|
| 11 |
+
import asyncio
|
| 12 |
+
import tempfile
|
| 13 |
+
import time
|
| 14 |
+
import hashlib
|
| 15 |
+
import re
|
| 16 |
+
import collections
|
| 17 |
+
from typing import Dict, List, Any, Optional, Union, Tuple
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from enum import Enum
|
| 21 |
+
|
| 22 |
+
# LlamaIndex v0.12 imports
|
| 23 |
+
from llama_index.core import Settings, Document, SimpleDirectoryReader
|
| 24 |
+
from llama_index.core.schema import BaseEmbedding, BaseNode, TextNode
|
| 25 |
+
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 26 |
+
from llama_index.llms.huggingface import HuggingFaceLLM
|
| 27 |
+
from llama_index.core.node_parser import (
|
| 28 |
+
SentenceSplitter,
|
| 29 |
+
SemanticSplitterNodeParser,
|
| 30 |
+
TokenTextSplitter,
|
| 31 |
+
HierarchicalNodeParser
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# Transformers et models
|
| 35 |
+
from sentence_transformers import SentenceTransformer
|
| 36 |
+
import torch
|
| 37 |
+
import gc
|
| 38 |
+
|
| 39 |
+
# Import schemas
|
| 40 |
+
from schemas import ChunkRequest, ChunkResponse, SemanticChunk, ChunkLevel, ChunkMetadata
|
| 41 |
+
|
| 42 |
+
# Import du chunker personnalisé
|
| 43 |
+
from custom_recursive_chunker import CustomRecursiveChunker, ChunkResult
|
| 44 |
+
|
| 45 |
+
# Configuration logging
|
| 46 |
+
logging.basicConfig(
|
| 47 |
+
level=logging.INFO,
|
| 48 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 49 |
+
)
|
| 50 |
+
logger = logging.getLogger(__name__)
|
| 51 |
+
|
| 52 |
+
# Vérification disponibilité Chonkie (optionnel)
|
| 53 |
+
try:
|
| 54 |
+
from chonkie import SemanticChunker
|
| 55 |
+
CHONKIE_AVAILABLE = True
|
| 56 |
+
logger.info("✅ Chonkie disponible")
|
| 57 |
+
except ImportError:
|
| 58 |
+
CHONKIE_AVAILABLE = False
|
| 59 |
+
logger.warning("⚠️ Chonkie non disponible - utilisation CustomRecursiveChunker")
|
| 60 |
+
|
| 61 |
+
# ===================================
|
| 62 |
+
# CONFIGURATION ENVIRONNEMENT HF SPACE
|
| 63 |
+
# ===================================
|
| 64 |
+
|
| 65 |
+
def setup_environment():
    """Configuration optimisée pour Hugging Face Space gratuit.

    Redirects every model cache into a writable temp directory, silences
    noisy library output, and pre-creates the cache folders. Failures to
    create a folder are logged but never fatal.
    """
    cache_base = os.path.join(tempfile.gettempdir(), "cache")

    # Point all HF caches at the writable temp area.
    hf_caches = {
        "HF_HOME": os.path.join(cache_base, "huggingface"),
        "TRANSFORMERS_CACHE": os.path.join(cache_base, "transformers"),
        "HF_HUB_CACHE": os.path.join(cache_base, "hub"),
    }
    os.environ.update(hf_caches)

    # Performance / verbosity tweaks.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
    os.environ["TRANSFORMERS_VERBOSITY"] = "error"
    os.environ["PYTHONUNBUFFERED"] = "1"

    # Create every cache directory up front (best effort).
    extra_dirs = [os.path.join(cache_base, sub) for sub in ("llm", "embeddings", "logs")]
    for cache_dir in list(hf_caches.values()) + extra_dirs:
        try:
            os.makedirs(cache_dir, exist_ok=True)
            os.chmod(cache_dir, 0o755)
        except Exception as e:
            logger.warning(f"⚠️ Impossible de créer {cache_dir}: {e}")

    logger.info("✅ Environnement HF Space configuré")

# Configuration environnement
setup_environment()
|
| 99 |
+
|
| 100 |
+
# ===================================
|
| 101 |
+
# WRAPPER EMBEDDING COMPATIBLE
|
| 102 |
+
# ===================================
|
| 103 |
+
|
| 104 |
+
class EmbeddingWrapper(BaseEmbedding):
    """Adapter exposing a SentenceTransformer through the BaseEmbedding API.

    The wrapped model is probed once at construction time so that a broken
    model fails fast instead of at first query.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self._validate_model()

    def _validate_model(self):
        """Probe the wrapped model once; raise if it yields no embeddings."""
        try:
            probe = self.model.encode(["test"], convert_to_tensor=False)
            if not hasattr(probe, "__len__") or len(probe) == 0:
                raise ValueError("Le modèle ne retourne pas d'embeddings valides")
            logger.info("✅ Le modèle d'embedding est conforme à BaseEmbedding (test réussi)")
        except Exception as e:
            logger.error(f"❌ Validation du modèle d'embedding échouée: {e}")
            raise

    def _encode_single(self, payload: str) -> List[float]:
        # Shared helper: encode one string and return it as a plain float list.
        return self.model.encode([payload], convert_to_tensor=False)[0].tolist()

    def _get_query_embedding(self, query: str) -> List[float]:
        """Embedding for a query string."""
        return self._encode_single(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Embedding for a document/text string."""
        return self._encode_single(text)

    async def _aget_query_embedding(self, query: str) -> List[float]:
        """Async variant of _get_query_embedding (delegates to the sync path)."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        """Async variant of _get_text_embedding (delegates to the sync path)."""
        return self._get_text_embedding(text)
|
| 138 |
+
|
| 139 |
+
# ===================================
|
| 140 |
+
# PIPELINE PRINCIPAL
|
| 141 |
+
# ===================================
|
| 142 |
+
|
| 143 |
+
class SmartChunkerPipeline:
|
| 144 |
+
"""Pipeline de chunking intelligent v4.0 avec toutes les fonctionnalités avancées"""
|
| 145 |
+
|
| 146 |
+
def __init__(self, config_path: str = "config.yaml"):
    """Load configuration and reset every component slot to its idle state.

    Heavy components (models, chunkers, parsers) stay None until
    initialize() is awaited.
    """
    self.config_path = config_path
    self.config = self._load_config()

    # All lazily-initialized components start out as None:
    # core models, Chonkie chunkers (optional) and LlamaIndex parsers (fallback).
    for slot in (
        "llm", "embed_model", "custom_recursive_chunker",
        "chonkie_semantic", "chonkie_recursive",
        "sentence_splitter", "semantic_splitter",
        "token_splitter", "hierarchical_parser",
    ):
        setattr(self, slot, None)

    # Registries backing the bidirectional chunk relationships.
    self._chunk_registry: Dict[str, SemanticChunk] = {}
    self._hierarchy_cache: Dict[str, List[str]] = {}

    self._is_initialized = False

    logger.info("🚀 SmartChunkerPipeline v4.0 initialisé (version fusionnée)")
|
| 172 |
+
|
| 173 |
+
def _load_config(self) -> Dict[str, Any]:
    """Load the YAML configuration, falling back to the built-in defaults.

    Any failure (missing file, unreadable YAML) degrades to
    _get_default_config() instead of raising.
    """
    try:
        if not os.path.exists(self.config_path):
            logger.warning(f"⚠️ Config {self.config_path} non trouvée, utilisation config par défaut")
            return self._get_default_config()
        # safe_load: never execute arbitrary YAML tags from the config file.
        with open(self.config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        logger.info(f"✅ Configuration chargée depuis {self.config_path}")
        return config
    except Exception as e:
        logger.error(f"❌ Erreur chargement config: {e}")
        return self._get_default_config()
|
| 187 |
+
|
| 188 |
+
def _get_default_config(self) -> Dict[str, Any]:
    """Build the default v4.0 configuration (models, chunking, exports)."""
    tmp = tempfile.gettempdir()

    llm_defaults = {
        "provider": "huggingface",
        "model_name": "llama-2-7b-chat",
        "temperature": 0.1,
        "max_tokens": 512,
        "device": "cpu",
        "cache_dir": os.path.join(tmp, "llm"),
    }
    embedding_defaults = {
        "provider": "huggingface",
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "cache_dir": os.path.join(tmp, "embeddings"),
        "max_length": 512,
        "normalize": True,
        "device": "cpu",
    }
    chunking_defaults = {
        # Primary chunker: three hierarchy levels, sentence-aware separators.
        "custom_recursive": {
            "enabled": True,
            "chunk_sizes": [2048, 512, 128],
            "separators": ["\n\n", "\n", ".", "!", "?", "—"],
            "overlap_ratio": 0.1,
            "min_chunk_size": 50,
            "semantic_threshold": 0.75,
        },
        # Optional Chonkie-based chunkers.
        "chonkie": {
            "semantic": {
                "enabled": True,
                "threshold": 0.75,
                "chunk_size": 512,
                "min_sentences": 1,
            },
            "recursive": {
                "enabled": True,
                "chunk_sizes": [2048, 512, 128],
                "overlap": 20,
                "separators": ["\n\n", "\n", ".", "!", "?"],
            },
        },
    }

    return {
        "models": {
            "llm": llm_defaults,
            "embedding": embedding_defaults,
        },
        "chunking": chunking_defaults,
        "obsidian": {
            "parent_format": "[[{title}]], {id}",
            "use_bidirectional_links": True,
            "vault_name": "Smart_Chunks",
        },
        "performance": {
            "memory": {
                "enable_garbage_collection": True,
                "cleanup_interval": 100,
            },
        },
    }
|
| 245 |
+
|
| 246 |
+
async def initialize(self) -> bool:
    """Bring up every pipeline component, in dependency order.

    Idempotent: a second call on an initialized pipeline is a no-op.
    Re-raises any initialization failure after logging it.
    """
    if self._is_initialized:
        logger.info("✅ Pipeline déjà initialisé")
        return True

    try:
        logger.info("🔄 Initialisation SmartChunkerPipeline v4.0...")

        # Models first: the chunkers and parsers below depend on them.
        await self._init_llm()
        await self._init_embedding()

        # Register the models globally for LlamaIndex v0.12.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model
        Settings.chunk_size = 512
        Settings.chunk_overlap = 20

        # Primary chunker.
        await self._init_custom_recursive_chunker()

        # Optional Chonkie chunkers, only when the library imported.
        if CHONKIE_AVAILABLE:
            await self._init_chonkie_chunkers()
        else:
            logger.warning("⚠️ Chonkie non disponible - utilisation CustomRecursiveChunker uniquement")

        # LlamaIndex parsers as the fallback path.
        await self._init_llamaindex_parsers()

        self._is_initialized = True
        logger.info("✅ SmartChunkerPipeline v4.0 initialisé avec succès")
        return True

    except Exception as e:
        logger.error(f"❌ Erreur initialisation chunker v4.0: {e}")
        raise
|
| 286 |
+
|
| 287 |
+
async def _init_llm(self):
    """Instantiate the HuggingFace LLM declared in the configuration.

    Only the "huggingface" provider is supported; anything else raises.
    Picks fp16 + auto device mapping when CUDA is available, fp32 on CPU.
    """
    llm_config = self.config.get("models", {}).get("llm", {})

    try:
        if llm_config.get("provider") != "huggingface":
            raise ValueError(f"Provider LLM non supporté: {llm_config.get('provider')}")

        model_name = llm_config.get("model_name", "llama-2-7b-chat")
        gpu_available = torch.cuda.is_available()

        self.llm = HuggingFaceLLM(
            model_name=model_name,
            tokenizer_name=model_name,
            max_new_tokens=llm_config.get("max_tokens", 512),
            device_map="auto" if gpu_available else "cpu",
            model_kwargs={
                # Half precision only makes sense on GPU.
                "torch_dtype": torch.float16 if gpu_available else torch.float32,
                "cache_dir": llm_config.get("cache_dir"),
            },
        )

        logger.info(f"✅ Modèle {model_name} chargé avec succès !")

    except Exception as e:
        logger.error(f"❌ Erreur chargement LLM: {e}")
        raise
|
| 313 |
+
|
| 314 |
+
async def _init_embedding(self):
    """Load the SentenceTransformer embedding model and wrap it.

    Only the "huggingface" provider is supported; anything else raises.
    The raw SentenceTransformer is wrapped in EmbeddingWrapper so it
    satisfies the LlamaIndex BaseEmbedding interface.
    """
    embedding_config = self.config.get("models", {}).get("embedding", {})

    try:
        if embedding_config.get("provider") != "huggingface":
            raise ValueError(f"Provider embedding non supporté: {embedding_config.get('provider')}")

        model_name = embedding_config.get("model_name", "sentence-transformers/all-MiniLM-L6-v2")

        raw_model = SentenceTransformer(
            model_name,
            cache_folder=embedding_config.get("cache_dir"),
            device=embedding_config.get("device", "cpu"),
        )

        # Adapter for LlamaIndex compatibility.
        self.embed_model = EmbeddingWrapper(raw_model)

        logger.info(f"✅ Modèle d'embedding {model_name} chargé avec succès")

    except Exception as e:
        logger.error(f"❌ Erreur chargement embedding: {e}")
        raise
|
| 339 |
+
|
| 340 |
+
async def _init_custom_recursive_chunker(self):
    """Create the primary CustomRecursiveChunker from configuration.

    Skips silently (with an info log) when disabled in the config;
    re-raises construction failures because this is the main chunker.
    """
    custom_config = self.config.get("chunking", {}).get("custom_recursive", {})

    if not custom_config.get("enabled", True):
        logger.info("⚠️ CustomRecursiveChunker désactivé")
        return

    try:
        chunker_kwargs = dict(
            embed_model=self.embed_model,
            chunk_sizes=custom_config.get("chunk_sizes", [2048, 512, 128]),
            separators=custom_config.get("separators", ["\n\n", "\n", ".", "!", "?", "—"]),
            overlap_ratio=custom_config.get("overlap_ratio", 0.1),
            min_chunk_size=custom_config.get("min_chunk_size", 50),
            semantic_threshold=custom_config.get("semantic_threshold", 0.75),
        )
        self.custom_recursive_chunker = CustomRecursiveChunker(**chunker_kwargs)

        logger.info("✅ CustomRecursiveChunker initialisé avec succès")

    except Exception as e:
        logger.error(f"❌ Erreur initialisation CustomRecursiveChunker: {e}")
        raise
|
| 363 |
+
|
| 364 |
+
async def _init_chonkie_chunkers(self):
    """Create the optional Chonkie SemanticChunker.

    Best-effort: a construction failure is only logged and leaves
    self.chonkie_semantic as None (the fallback path takes over).
    """
    semantic_config = self.config.get("chunking", {}).get("chonkie", {}).get("semantic", {})

    if not semantic_config.get("enabled", True):
        return

    try:
        # Restrict ourselves to the parameters supported by chonkie 1.0.10.
        self.chonkie_semantic = SemanticChunker(
            threshold=semantic_config.get("threshold", 0.75),
            chunk_size=semantic_config.get("chunk_size", 512),
            min_sentences=semantic_config.get("min_sentences", 1),
        )
        logger.info("✅ SemanticChunker (Chonkie) initialisé avec succès")

    except Exception as e:
        logger.warning(f"⚠️ Erreur initialisation Chonkie SemanticChunker: {e}")
        self.chonkie_semantic = None
|
| 383 |
+
|
| 384 |
+
async def _init_llamaindex_parsers(self):
    """Construct the LlamaIndex fallback parsers (best effort).

    Any failure is only logged as a warning — these parsers back up the
    primary chunkers and the pipeline can run without them.
    """
    try:
        # Plain sentence-based splitter.
        self.sentence_splitter = SentenceSplitter(
            chunk_size=512,
            chunk_overlap=20,
        )

        # Semantic splitter needs the embedding model to be loaded first.
        if self.embed_model:
            self.semantic_splitter = SemanticSplitterNodeParser(
                embed_model=self.embed_model,
                buffer_size=1,
                breakpoint_percentile_threshold=95,
            )

        # Token-count based splitter.
        self.token_splitter = TokenTextSplitter(
            chunk_size=512,
            chunk_overlap=20,
        )

        # Multi-level hierarchical parser (same sizes as the custom chunker).
        self.hierarchical_parser = HierarchicalNodeParser.from_defaults(
            chunk_sizes=[2048, 512, 128],
        )

        logger.info("✅ Parsers LlamaIndex v0.12 initialisés")

    except Exception as e:
        logger.warning(f"⚠️ Erreur initialisation parsers LlamaIndex: {e}")
|
| 416 |
+
|
| 417 |
+
# ===================================
|
| 418 |
+
# MÉTHODE PRINCIPALE DE TRAITEMENT
|
| 419 |
+
# ===================================
|
| 420 |
+
|
| 421 |
+
async def process_text(self, request: "ChunkRequest") -> "ChunkResponse":
    """Main entry point: run the full v4.0 chunking pipeline.

    Pipeline steps: preprocessing, automatic structure detection,
    hierarchical chunking (custom chunker preferred, LlamaIndex fallback),
    semantic refinement (Chonkie when available, LlamaIndex fallback),
    bidirectional relationship construction, intelligent enrichment,
    export generation and optional memory cleanup.

    Args:
        request: Chunking request carrying the text and options.

    Returns:
        A fully populated ChunkResponse.

    Raises:
        Exception: any failure of the underlying pipeline is logged and
            re-raised unchanged.
    """
    if not self._is_initialized:
        await self.initialize()

    start_time = time.time()

    try:
        logger.info(f"🚀 Début traitement v4.0 - {len(request.text)} caractères")

        # 1. Improved preprocessing / cleanup.
        cleaned_text = self._preprocess_text_v4(request.text)

        # 2. Advanced automatic structure detection.
        documents = await self._detect_structure_v4(cleaned_text, request)

        # 3. Hierarchical chunking: custom recursive chunker when present.
        if self.custom_recursive_chunker:
            hierarchical_chunks = await self._apply_custom_hierarchical_chunking(documents, request)
        else:
            hierarchical_chunks = await self._apply_llamaindex_hierarchical_chunking(documents, request)

        # 4. Semantic chunking: Chonkie when available, fallback otherwise.
        if CHONKIE_AVAILABLE and self.chonkie_semantic:
            semantic_chunks = await self._apply_chonkie_semantic_chunking(hierarchical_chunks, request)
        else:
            semantic_chunks = await self._apply_fallback_semantic_chunking(hierarchical_chunks, request)

        # 5. Full bidirectional relationship construction.
        enriched_chunks = await self._build_bidirectional_relationships_v4(semantic_chunks)

        # 6. Concept extraction and intelligent metadata.
        final_chunks = await self._enrich_with_intelligence_v4(enriched_chunks, request)

        # 7. Export generation (Obsidian format included).
        exports = await self._generate_exports_v4(final_chunks, request)

        processing_time = time.time() - start_time

        # 8. Automatic memory cleanup for the HF Space environment.
        if self.config.get("performance", {}).get("memory", {}).get("enable_garbage_collection", True):
            await self._cleanup_memory_v4()

        # Build the final response.
        response = ChunkResponse(
            chunks=final_chunks,
            hierarchy=self._build_hierarchy_levels_v4(final_chunks),
            total_chunks=len(final_chunks),
            total_tokens=sum(c.metadata.tokens_count for c in final_chunks),
            processing_time=processing_time,
            source_metadata=self._build_source_metadata_v4(request),
            concept_graph=exports.get("concept_graph", {}),
            obsidian_export=exports.get("obsidian"),
            agent_knowledge=exports.get("agents")
        )

        logger.info(f"✅ Chunking v4.0 terminé: {len(final_chunks)} chunks en {processing_time:.2f}s")
        return response

    except Exception as e:
        logger.error(f"❌ Erreur chunking v4.0: {e}")
        raise
|
| 491 |
+
|
| 492 |
+
# ===================================
|
| 493 |
+
# PREPROCESSING AVANCÉ
|
| 494 |
+
# ===================================
|
| 495 |
+
|
| 496 |
+
def _preprocess_text_v4(self, text: str) -> str:
|
| 497 |
+
"""Preprocessing amélioré v4.0 avec détection patterns avancés"""
|
| 498 |
+
# Normalisation base
|
| 499 |
+
text = re.sub(r'\r\n|\r', '\n', text)
|
| 500 |
+
text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
|
| 501 |
+
text = re.sub(r'[ \t]+', ' ', text)
|
| 502 |
+
|
| 503 |
+
# ✅ NOUVEAU v4.0: Nettoyage patterns spécifiques
|
| 504 |
+
# Suppression références inutiles
|
| 505 |
+
text = re.sub(r'\[?\d+\]?', '', text) # Références numériques [1], [2]
|
| 506 |
+
text = re.sub(r'http[s]?://\S+', '<URL>', text) # URLs remplacées par placeholder
|
| 507 |
+
|
| 508 |
+
# Préservation structures importantes
|
| 509 |
+
text = re.sub(r'^(#{1,6})\s+(.+)$', r'\1 \2', text, flags=re.MULTILINE) # Headers Markdown
|
| 510 |
+
text = re.sub(r'^\s*(\d+\.|\*|-|\+)\s+', r'\1 ', text, flags=re.MULTILINE) # Listes
|
| 511 |
+
|
| 512 |
+
return text.strip()
|
| 513 |
+
|
| 514 |
+
async def _detect_structure_v4(self, text: str, request: "ChunkRequest") -> List[Document]:
    """Detect the text's structural features and wrap it in a Document.

    Args:
        text: Preprocessed input text.
        request: Original chunking request (title, source id, metadata).

    Returns:
        A single-element list containing the enriched Document.
    """
    # Structural pattern detection.
    has_markdown_headers = bool(re.search(r'^#{1,6}\s+', text, re.MULTILINE))
    has_numbered_sections = bool(re.search(r'^\d+\.\s+[A-Z]', text, re.MULTILINE))
    # FIX: '[*-+]' is a character RANGE from '*' to '+' and does NOT match
    # '-' bullets; '[-*+]' matches all three bullet markers -, * and +.
    has_bullet_points = bool(re.search(r'^\s*[-*+]\s+', text, re.MULTILINE))

    # Metadata describing the detected structure.
    structure_metadata = {
        "has_markdown_headers": has_markdown_headers,
        "has_numbered_sections": has_numbered_sections,
        "has_bullet_points": has_bullet_points,
        "estimated_structure": "hierarchical" if has_markdown_headers else "linear",
        "language": "fr",  # TODO: replace by automatic language detection
        "source_id": request.source_id or "default"
    }

    # Document-level metadata, optionally extended by the request's own.
    metadata = {
        "title": request.titre or "Document sans titre",
        "source": request.source_id or "unknown",
        "structure": structure_metadata,
        "processing_timestamp": time.time()
    }

    if request.include_metadata and hasattr(request, 'metadata') and request.metadata:
        metadata.update(request.metadata)

    document = Document(
        text=text,
        metadata=metadata
    )

    return [document]
|
| 549 |
+
|
| 550 |
+
# ===================================
|
| 551 |
+
# CHUNKING HIÉRARCHIQUE PERSONNALISÉ
|
| 552 |
+
# ===================================
|
| 553 |
+
|
| 554 |
+
async def _apply_custom_hierarchical_chunking(self, documents: List[Document],
                                              request: "ChunkRequest") -> List["SemanticChunk"]:
    """Hierarchical chunking through the CustomRecursiveChunker.

    Any document the custom chunker fails on falls back to the LlamaIndex
    hierarchical pipeline, so no document is silently dropped.
    """
    chunks = []

    for doc_idx, document in enumerate(documents):
        try:
            chunk_results = await self.custom_recursive_chunker.chunk_text(
                text=document.text,
                metadata=document.metadata
            )

            # Convert every result into an enriched SemanticChunk.
            for chunk_result in chunk_results:
                semantic_chunk = self._convert_to_semantic_chunk_v4(
                    chunk_result, doc_idx, request
                )
                chunks.append(semantic_chunk)

                # Register the chunk so relationships can be resolved later.
                self._chunk_registry[semantic_chunk.metadata.chunk_id] = semantic_chunk

        except Exception as e:
            logger.error(f"❌ Erreur chunking document {doc_idx}: {e}")
            # Fallback to simple LlamaIndex chunking for this document.
            fallback_chunks = await self._apply_llamaindex_hierarchical_chunking([document], request)
            chunks.extend(fallback_chunks)

    return chunks
|
| 585 |
+
|
| 586 |
+
def _convert_to_semantic_chunk_v4(self, chunk_result: "ChunkResult",
                                  doc_idx: int, request: "ChunkRequest") -> "SemanticChunk":
    """Convert a ChunkResult into an enriched SemanticChunk.

    Args:
        chunk_result: Raw result produced by the custom chunker.
        doc_idx: Index of the source document (used for fallback ids).
        request: Original chunking request (title / source id).

    Returns:
        The enriched SemanticChunk.
    """
    # Enriched metadata. NOTE(review): tokens_count is a whitespace word
    # count, not a real tokenizer count.
    metadata = ChunkMetadata(
        chunk_id=chunk_result.id,
        level=self._determine_chunk_level_v4(chunk_result.level),
        parent_id=chunk_result.parent_id,
        children_ids=chunk_result.children_ids,
        tokens_count=len(chunk_result.text.split()),
        source_title=request.titre or "Document",
        source_id=request.source_id or f"doc_{doc_idx}",
        confidence_score=chunk_result.semantic_similarity or 0.8
    )

    # Automatic title detection from the chunk's own content.
    detected_title = self._extract_title_from_content(chunk_result.text)
    if detected_title:
        metadata.detected_title = detected_title

    return SemanticChunk(
        content=chunk_result.text,
        title=detected_title or f"Chunk {chunk_result.id[:8]}",
        metadata=metadata,
        embedding_vector=chunk_result.embedding_vector
    )
|
| 616 |
+
|
| 617 |
+
def _extract_title_from_content(self, content: str) -> Optional[str]:
|
| 618 |
+
"""Extraction titre depuis le contenu"""
|
| 619 |
+
lines = content.strip().split('\n')
|
| 620 |
+
|
| 621 |
+
for line in lines[:3]: # Regarde les 3 premières lignes
|
| 622 |
+
line = line.strip()
|
| 623 |
+
|
| 624 |
+
# Headers Markdown
|
| 625 |
+
if re.match(r'^#{1,6}\s+', line):
|
| 626 |
+
return re.sub(r'^#{1,6}\s+', '', line).strip()
|
| 627 |
+
|
| 628 |
+
# Ligne courte probablement titre
|
| 629 |
+
if len(line) < 100 and len(line) > 10 and not line.endswith('.'):
|
| 630 |
+
return line
|
| 631 |
+
|
| 632 |
+
# Fallback: première phrase
|
| 633 |
+
sentences = re.split(r'[.!?]+', content)
|
| 634 |
+
if sentences and len(sentences[0]) < 150:
|
| 635 |
+
return sentences[0].strip()
|
| 636 |
+
|
| 637 |
+
return None
|
| 638 |
+
|
| 639 |
+
# ===================================
|
| 640 |
+
# CHUNKING SÉMANTIQUE CHONKIE
|
| 641 |
+
# ===================================
|
| 642 |
+
|
| 643 |
+
async def _apply_chonkie_semantic_chunking(self, chunks: List["SemanticChunk"],
                                           request: "ChunkRequest") -> List["SemanticChunk"]:
    """Refine chunks with the Chonkie SemanticChunker.

    Chunks that Chonkie splits into several semantic parts are replaced by
    sub-chunks linked to the original through parent/children ids; chunks
    left whole (or raising an error) are kept as-is.
    """
    if not self.chonkie_semantic:
        return chunks

    refined_chunks = []

    for chunk in chunks:
        try:
            # NOTE(review): assumes Chonkie's .chunk() returns parts usable
            # directly as string content — confirm against chonkie 1.0.10.
            semantic_parts = self.chonkie_semantic.chunk(chunk.content)

            if len(semantic_parts) <= 1:
                # No semantic subdivision needed.
                refined_chunks.append(chunk)
                continue

            # Semantic subdivision detected: emit one sub-chunk per part.
            for i, part in enumerate(semantic_parts):
                sub_chunk = SemanticChunk(
                    content=part,
                    title=f"{chunk.title} - Partie {i+1}",
                    metadata=ChunkMetadata(
                        chunk_id=f"{chunk.metadata.chunk_id}_sem_{i}",
                        level=chunk.metadata.level,
                        parent_id=chunk.metadata.chunk_id,
                        children_ids=[],
                        tokens_count=len(part.split()),
                        source_title=chunk.metadata.source_title,
                        source_id=chunk.metadata.source_id,
                        confidence_score=0.85,
                        chunk_type="semantic_subdivision"
                    )
                )
                refined_chunks.append(sub_chunk)

                # Keep parent/child links and the registry consistent.
                chunk.metadata.children_ids.append(sub_chunk.metadata.chunk_id)
                self._chunk_registry[sub_chunk.metadata.chunk_id] = sub_chunk

        except Exception as e:
            logger.warning(f"⚠️ Erreur chunking sémantique Chonkie: {e}")
            refined_chunks.append(chunk)

    return refined_chunks
|
| 689 |
+
|
| 690 |
+
async def _apply_fallback_semantic_chunking(self, chunks: List["SemanticChunk"],
                                            request: "ChunkRequest") -> List["SemanticChunk"]:
    """Fallback semantic refinement through LlamaIndex.

    Mirrors _apply_chonkie_semantic_chunking but uses
    SemanticSplitterNodeParser; chunks are kept unchanged when no splitter
    is available, no subdivision occurs or an error is raised.
    """
    if not self.semantic_splitter:
        return chunks

    refined_chunks = []

    for chunk in chunks:
        try:
            # Wrap the content in a TextNode for LlamaIndex.
            # NOTE(review): get_nodes_from_documents is fed a TextNode
            # rather than a Document — confirm the installed LlamaIndex
            # version accepts this.
            text_node = TextNode(
                text=chunk.content,
                metadata=chunk.metadata.__dict__
            )

            semantic_nodes = self.semantic_splitter.get_nodes_from_documents([text_node])

            if len(semantic_nodes) <= 1:
                refined_chunks.append(chunk)
                continue

            # Convert each resulting node back into a SemanticChunk.
            for i, node in enumerate(semantic_nodes):
                sub_chunk = SemanticChunk(
                    content=node.get_content(),
                    title=f"{chunk.title} - Segment {i+1}",
                    metadata=ChunkMetadata(
                        chunk_id=f"{chunk.metadata.chunk_id}_llama_{i}",
                        level=chunk.metadata.level,
                        parent_id=chunk.metadata.chunk_id,
                        children_ids=[],
                        tokens_count=len(node.get_content().split()),
                        source_title=chunk.metadata.source_title,
                        source_id=chunk.metadata.source_id,
                        confidence_score=0.75,
                        chunk_type="semantic_llamaindex"
                    )
                )
                refined_chunks.append(sub_chunk)
                self._chunk_registry[sub_chunk.metadata.chunk_id] = sub_chunk

        except Exception as e:
            logger.warning(f"⚠️ Erreur chunking sémantique LlamaIndex: {e}")
            refined_chunks.append(chunk)

    return refined_chunks
|
| 738 |
+
|
| 739 |
+
async def _apply_llamaindex_hierarchical_chunking(self, documents: List[Document],
                                                  request: "ChunkRequest") -> List["SemanticChunk"]:
    """Fallback hierarchical chunking via LlamaIndex parsers.

    Uses HierarchicalNodeParser when available, otherwise the plain
    SentenceSplitter. Errors are logged and re-raised (no silent loss).
    """
    chunks = []

    for doc_idx, document in enumerate(documents):
        try:
            if self.hierarchical_parser:
                nodes = self.hierarchical_parser.get_nodes_from_documents([document])
            else:
                # Simple sentence-based fallback.
                nodes = self.sentence_splitter.get_nodes_from_documents([document])

            # Convert each node into a SemanticChunk.
            for node_idx, node in enumerate(nodes):
                content = node.get_content()
                chunk_id = self._generate_chunk_id_v4(content, doc_idx, 0, node_idx)

                metadata = ChunkMetadata(
                    chunk_id=chunk_id,
                    level=ChunkLevel.SECTION,
                    parent_id=None,
                    children_ids=[],
                    tokens_count=len(content.split()),
                    source_title=request.titre or "Document",
                    source_id=request.source_id or f"doc_{doc_idx}",
                    confidence_score=0.7,
                    chunk_type="llamaindex_hierarchical"
                )

                semantic_chunk = SemanticChunk(
                    content=content,
                    title=self._extract_title_from_content(content) or f"Chunk {chunk_id[:8]}",
                    metadata=metadata
                )

                chunks.append(semantic_chunk)
                self._chunk_registry[chunk_id] = semantic_chunk

        except Exception as e:
            logger.error(f"❌ Erreur chunking LlamaIndex hiérarchique: {e}")
            raise

    return chunks
|
| 786 |
+
|
| 787 |
+
# ===================================
|
| 788 |
+
# RELATIONS BIDIRECTIONNELLES
|
| 789 |
+
# ===================================
|
| 790 |
+
|
| 791 |
+
async def _build_bidirectional_relationships_v4(self, chunks: List["SemanticChunk"]) -> List["SemanticChunk"]:
    """Build complete bidirectional relationships between chunks.

    Adds prev/next sibling links within each hierarchy level (stable
    chunk_id ordering) and makes sure every registered parent lists its
    children.
    """
    logger.info(f"🔗 Construction relations bidirectionnelles pour {len(chunks)} chunks")

    # Group chunks by hierarchy level (enum value or raw level).
    chunks_by_level = {}
    for chunk in chunks:
        level = chunk.metadata.level.value if hasattr(chunk.metadata.level, 'value') else chunk.metadata.level
        chunks_by_level.setdefault(level, []).append(chunk)

    # Horizontal (sibling) relationships within each level.
    for level, level_chunks in chunks_by_level.items():
        level_chunks.sort(key=lambda x: x.metadata.chunk_id)  # stable ordering

        for i, chunk in enumerate(level_chunks):
            if i > 0:
                chunk.metadata.prev_id = level_chunks[i-1].metadata.chunk_id
            if i < len(level_chunks) - 1:
                chunk.metadata.next_id = level_chunks[i+1].metadata.chunk_id

    # Vertical (parent/child) relationships: verify consistency against
    # the registry built during chunking.
    for chunk in chunks:
        if chunk.metadata.parent_id and chunk.metadata.parent_id in self._chunk_registry:
            parent_chunk = self._chunk_registry[chunk.metadata.parent_id]
            if chunk.metadata.chunk_id not in parent_chunk.metadata.children_ids:
                parent_chunk.metadata.children_ids.append(chunk.metadata.chunk_id)

    logger.info("✅ Relations bidirectionnelles construites")
    return chunks
|
| 826 |
+
|
| 827 |
+
# ===================================
|
| 828 |
+
# ENRICHISSEMENT INTELLIGENT
|
| 829 |
+
# ===================================
|
| 830 |
+
|
| 831 |
+
async def _enrich_with_intelligence_v4(self, chunks: List["SemanticChunk"],
                                       request: "ChunkRequest") -> List["SemanticChunk"]:
    """Enrich every chunk with keywords, a type and a confidence score.

    Mutates the chunks in place and returns the same list.
    """
    logger.info(f"🧠 Enrichissement intelligent de {len(chunks)} chunks")

    for chunk in chunks:
        # Keyword / concept extraction.
        await self._extract_keywords_and_concepts_v4(chunk)
        # Pattern-based chunk type classification.
        chunk.metadata.chunk_type = self._classify_chunk_type_v4(chunk.content)
        # Adjusted confidence score.
        chunk.metadata.confidence_score = self._calculate_confidence_score_v4(chunk)

    logger.info("✅ Enrichissement intelligent terminé")
    return chunks
|
| 849 |
+
|
| 850 |
+
async def _extract_keywords_and_concepts_v4(self, chunk: SemanticChunk):
|
| 851 |
+
"""Extraction keywords et concepts simples mais efficaces"""
|
| 852 |
+
|
| 853 |
+
# Tokenisation simple
|
| 854 |
+
words = re.findall(r'\b[a-zA-ZÀ-ÿ]{3,}\b', chunk.content.lower())
|
| 855 |
+
|
| 856 |
+
# Stop words français basiques
|
| 857 |
+
stop_words = {
|
| 858 |
+
'le', 'de', 'un', 'à', 'être', 'et', 'en', 'avoir', 'que', 'pour',
|
| 859 |
+
'dans', 'ce', 'il', 'une', 'sur', 'avec', 'ne', 'se', 'pas', 'tout',
|
| 860 |
+
'plus', 'par', 'grand', 'comme', 'même', 'temps', 'très', 'bien',
|
| 861 |
+
'où', 'sans', 'entre', 'sous', 'deux', 'aussi', 'ces', 'son', 'peut'
|
| 862 |
+
}
|
| 863 |
+
|
| 864 |
+
# Filtrage et comptage
|
| 865 |
+
words = [w for w in words if w not in stop_words and len(w) > 2]
|
| 866 |
+
|
| 867 |
+
# Comptage fréquences
|
| 868 |
+
word_counts = collections.Counter(words)
|
| 869 |
+
top_words = [word for word, count in word_counts.most_common(5)]
|
| 870 |
+
|
| 871 |
+
# Extraction concepts simples (mots capitalisés ou répétés)
|
| 872 |
+
concept_candidates = re.findall(r'\b[A-ZÀ-Ÿ][a-zA-ZÀ-ÿ]{4,}\b', chunk.content)
|
| 873 |
+
concepts = list(set(concept_candidates))[:3]
|
| 874 |
+
|
| 875 |
+
# Mise à jour
|
| 876 |
+
chunk.metadata.keywords = top_words
|
| 877 |
+
chunk.metadata.main_concepts = concepts if concepts else top_words[:3]
|
| 878 |
+
|
| 879 |
+
def _classify_chunk_type_v4(self, content: str) -> str:
|
| 880 |
+
"""Classification type de chunk basée sur patterns"""
|
| 881 |
+
|
| 882 |
+
# Détection patterns spécifiques
|
| 883 |
+
if re.search(r'^\s*#{1,6}\s+', content, re.MULTILINE):
|
| 884 |
+
return "header"
|
| 885 |
+
elif re.search(r'^\s*\d+\.\s+', content, re.MULTILINE):
|
| 886 |
+
return "numbered_list"
|
| 887 |
+
elif re.search(r'^\s*[*-+]\s+', content, re.MULTILINE):
|
| 888 |
+
return "bullet_list"
|
| 889 |
+
elif re.search(r'\b(définition|concept|principe)\b', content.lower()):
|
| 890 |
+
return "definition"
|
| 891 |
+
elif re.search(r'\b(exemple|illustration|cas)\b', content.lower()):
|
| 892 |
+
return "example"
|
| 893 |
+
elif re.search(r'\b(méthode|procédure|étape)\b', content.lower()):
|
| 894 |
+
return "method"
|
| 895 |
+
else:
|
| 896 |
+
return "concept"
|
| 897 |
+
|
| 898 |
+
def _calculate_confidence_score_v4(self, chunk: SemanticChunk) -> float:
|
| 899 |
+
"""Calcul score de confiance basé sur plusieurs facteurs"""
|
| 900 |
+
|
| 901 |
+
score = 0.5 # Score base
|
| 902 |
+
|
| 903 |
+
# Facteurs positifs
|
| 904 |
+
if chunk.metadata.keywords and len(chunk.metadata.keywords) >= 3:
|
| 905 |
+
score += 0.1
|
| 906 |
+
|
| 907 |
+
if chunk.metadata.main_concepts and len(chunk.metadata.main_concepts) >= 2:
|
| 908 |
+
score += 0.1
|
| 909 |
+
|
| 910 |
+
if chunk.metadata.detected_title:
|
| 911 |
+
score += 0.1
|
| 912 |
+
|
| 913 |
+
if len(chunk.content.split()) >= 20: # Chunk substantiel
|
| 914 |
+
score += 0.1
|
| 915 |
+
|
| 916 |
+
if chunk.metadata.children_ids: # A des enfants
|
| 917 |
+
score += 0.1
|
| 918 |
+
|
| 919 |
+
# Facteurs négatifs
|
| 920 |
+
if len(chunk.content.split()) < 10: # Chunk trop court
|
| 921 |
+
score -= 0.2
|
| 922 |
+
|
| 923 |
+
return max(0.1, min(1.0, score))
|
| 924 |
+
|
| 925 |
+
def _determine_chunk_level_v4(self, level: int) -> "ChunkLevel":
    """Map a numeric depth (0-5) onto the ChunkLevel enum.

    Unknown depths default to ChunkLevel.DETAIL.
    """
    level_map = {
        0: ChunkLevel.DOCUMENT,
        1: ChunkLevel.CHAPTER,
        2: ChunkLevel.SECTION,
        3: ChunkLevel.SUBSECTION,
        4: ChunkLevel.CONCEPT,
        5: ChunkLevel.DETAIL,
    }
    return level_map.get(level, ChunkLevel.DETAIL)
|
| 936 |
+
|
| 937 |
+
def _generate_chunk_id_v4(self, text: str, doc_idx: int, level: int, node_idx: int) -> str:
|
| 938 |
+
"""Génération ID unique traçable v4.0"""
|
| 939 |
+
content_hash = hashlib.md5(text.encode()).hexdigest()[:8]
|
| 940 |
+
timestamp = int(time.time()) % 10000
|
| 941 |
+
return f"chk_{doc_idx:02d}_{level}_{node_idx:03d}_{content_hash}_{timestamp}"
|
| 942 |
+
|
| 943 |
+
# ===================================
|
| 944 |
+
# GÉNÉRATION EXPORTS
|
| 945 |
+
# ===================================
|
| 946 |
+
|
| 947 |
+
async def _generate_exports_v4(self, chunks: List["SemanticChunk"], request: "ChunkRequest") -> Dict[str, Any]:
    """Generate the "second brain" and agent exports (v4.0).

    Returns:
        Dict with the "obsidian", "agents" and "concept_graph" exports.
    """
    exports = {
        # Obsidian export with the corrected [[Title]], id format.
        "obsidian": await self._generate_obsidian_export_v4(chunks, request),
        # Specialized-agent knowledge base.
        "agents": await self._generate_agent_knowledge_v4(chunks, request),
        # Concept graph for visualization.
        "concept_graph": self._extract_concept_graph_v4(chunks),
    }
    return exports
|
| 961 |
+
|
| 962 |
+
async def _generate_obsidian_export_v4(self, chunks: List["SemanticChunk"], request: "ChunkRequest") -> Dict[str, Any]:
    """Build the Obsidian vault export using the [[Title]], id link format.

    Each chunk becomes one Markdown note with a YAML front-matter header,
    navigation links (parent / children / prev / next) and the content.
    """
    obsidian_config = self.config.get("obsidian", {})
    parent_format = obsidian_config.get("parent_format", "[[{title}]], {id}")

    notes = []
    for chunk in chunks:
        meta = chunk.metadata

        # Parent link in the configured [[Title]], id format.
        parent_link = None
        if meta.parent_id:
            parent_link = parent_format.format(
                title=self._get_chunk_title_by_id(meta.parent_id),
                id=meta.parent_id
            )

        lines = []

        # YAML front-matter header with the chunk's metadata.
        lines.append("---")
        lines.append(f"id: {meta.chunk_id}")
        lines.append(f"type: {meta.chunk_type}")
        lines.append(f"level: {meta.level}")
        lines.append(f"source: {meta.source_title}")
        if meta.keywords:
            lines.append(f"keywords: {', '.join(meta.keywords)}")
        if meta.main_concepts:
            lines.append(f"concepts: {', '.join(meta.main_concepts)}")
        lines.append("---")
        lines.append("")

        # Navigation links.
        if parent_link:
            lines.append(f"**Parent:** {parent_link}")

        if meta.children_ids:
            children_links = [
                parent_format.format(title=self._get_chunk_title_by_id(child_id), id=child_id)
                for child_id in meta.children_ids
            ]
            lines.append(f"**Enfants:** {', '.join(children_links)}")

        # Sibling links (present only after relationship building).
        if hasattr(meta, 'prev_id') and meta.prev_id:
            prev_link = parent_format.format(
                title=self._get_chunk_title_by_id(meta.prev_id), id=meta.prev_id
            )
            lines.append(f"**Précédent:** {prev_link}")

        if hasattr(meta, 'next_id') and meta.next_id:
            next_link = parent_format.format(
                title=self._get_chunk_title_by_id(meta.next_id), id=meta.next_id
            )
            lines.append(f"**Suivant:** {next_link}")

        lines.append("")
        lines.append("---")
        lines.append("")

        # Main content section.
        lines.append("## Contenu")
        lines.append("")
        lines.append(chunk.content)

        notes.append({
            "filename": f"{meta.chunk_id}.md",
            "title": chunk.title,
            "content": "\n".join(lines),
            "metadata": {
                "id": meta.chunk_id,
                "level": meta.level,
                "parent_id": meta.parent_id,
                "children_count": len(meta.children_ids),
                "keywords": meta.keywords,
                "concepts": meta.main_concepts,
                "confidence": meta.confidence_score
            }
        })

    return {
        "format": "obsidian_vault_v4",
        "version": "4.0.0",
        "notes": notes,
        "vault_config": {
            "name": f"Vault_{request.source_id or 'default'}",
            "bidirectional_links": obsidian_config.get("use_bidirectional_links", True),
            "parent_format": parent_format
        },
        "statistics": {
            "total_notes": len(notes),
            "total_concepts": len(set(c for chunk in chunks for c in chunk.metadata.main_concepts or [])),
            "hierarchy_levels": len(set(chunk.metadata.level for chunk in chunks))
        }
    }
|
| 1058 |
+
|
| 1059 |
+
def _get_chunk_title_by_id(self, chunk_id: str) -> str:
|
| 1060 |
+
"""Récupération titre chunk par ID pour liens Obsidian"""
|
| 1061 |
+
if chunk_id in self._chunk_registry:
|
| 1062 |
+
chunk = self._chunk_registry[chunk_id]
|
| 1063 |
+
return chunk.title or chunk.metadata.detected_title or f"Chunk {chunk_id[:8]}"
|
| 1064 |
+
return f"Chunk {chunk_id[:8]}"
|
| 1065 |
+
|
| 1066 |
+
async def _generate_agent_knowledge_v4(self, chunks: List[SemanticChunk], request: ChunkRequest) -> Dict[str, Any]:
|
| 1067 |
+
"""Génération base connaissance agents spécialisés v4.0"""
|
| 1068 |
+
|
| 1069 |
+
# Classification par type pour agents
|
| 1070 |
+
knowledge_base = {
|
| 1071 |
+
"principles": [],
|
| 1072 |
+
"methods": [],
|
| 1073 |
+
"examples": [],
|
| 1074 |
+
"concepts": [],
|
| 1075 |
+
"frameworks": [],
|
| 1076 |
+
"definitions": []
|
| 1077 |
+
}
|
| 1078 |
+
|
| 1079 |
+
for chunk in chunks:
|
| 1080 |
+
chunk_type = chunk.metadata.chunk_type or "concept"
|
| 1081 |
+
|
| 1082 |
+
knowledge_item = {
|
| 1083 |
+
"id": chunk.metadata.chunk_id,
|
| 1084 |
+
"content": chunk.content,
|
| 1085 |
+
"concepts": chunk.metadata.main_concepts,
|
| 1086 |
+
"keywords": chunk.metadata.keywords,
|
| 1087 |
+
"confidence": chunk.metadata.confidence_score,
|
| 1088 |
+
"level": chunk.metadata.level,
|
| 1089 |
+
"source": chunk.metadata.source_title,
|
| 1090 |
+
"detected_title": chunk.metadata.detected_title,
|
| 1091 |
+
"relations": {
|
| 1092 |
+
"parent": chunk.metadata.parent_id,
|
| 1093 |
+
"children": chunk.metadata.children_ids,
|
| 1094 |
+
"siblings": [getattr(chunk.metadata, 'prev_id', None), getattr(chunk.metadata, 'next_id', None)]
|
| 1095 |
+
}
|
| 1096 |
+
}
|
| 1097 |
+
|
| 1098 |
+
# Classification intelligente pour agents
|
| 1099 |
+
if chunk_type in ["definition", "concept"]:
|
| 1100 |
+
knowledge_base["concepts"].append(knowledge_item)
|
| 1101 |
+
elif chunk_type in ["method", "procedure"]:
|
| 1102 |
+
knowledge_base["methods"].append(knowledge_item)
|
| 1103 |
+
elif chunk_type == "example":
|
| 1104 |
+
knowledge_base["examples"].append(knowledge_item)
|
| 1105 |
+
elif "principe" in chunk.content.lower():
|
| 1106 |
+
knowledge_base["principles"].append(knowledge_item)
|
| 1107 |
+
elif any(fw in chunk.content.lower() for fw in ["framework", "cadre", "modèle"]):
|
| 1108 |
+
knowledge_base["frameworks"].append(knowledge_item)
|
| 1109 |
+
else:
|
| 1110 |
+
knowledge_base["concepts"].append(knowledge_item)
|
| 1111 |
+
|
| 1112 |
+
return {
|
| 1113 |
+
"format": "agent_knowledge_v4",
|
| 1114 |
+
"version": "4.0.0",
|
| 1115 |
+
"knowledge_base": knowledge_base,
|
| 1116 |
+
"statistics": {
|
| 1117 |
+
"total_items": sum(len(items) for items in knowledge_base.values()),
|
| 1118 |
+
"by_type": {k: len(v) for k, v in knowledge_base.items()},
|
| 1119 |
+
"confidence_avg": sum(chunk.metadata.confidence_score for chunk in chunks) / len(chunks) if chunks else 0
|
| 1120 |
+
},
|
| 1121 |
+
"metadata": {
|
| 1122 |
+
"source": request.source_id or "unknown",
|
| 1123 |
+
"title": request.titre or "Document",
|
| 1124 |
+
"generated_at": time.time()
|
| 1125 |
+
}
|
| 1126 |
+
}
|
| 1127 |
+
|
| 1128 |
+
def _extract_concept_graph_v4(self, chunks: List[SemanticChunk]) -> Dict[str, Any]:
|
| 1129 |
+
"""Extraction graphe de concepts pour visualisation"""
|
| 1130 |
+
|
| 1131 |
+
nodes = []
|
| 1132 |
+
edges = []
|
| 1133 |
+
|
| 1134 |
+
# Extraction nodes (chunks)
|
| 1135 |
+
for chunk in chunks:
|
| 1136 |
+
node = {
|
| 1137 |
+
"id": chunk.metadata.chunk_id,
|
| 1138 |
+
"label": chunk.title,
|
| 1139 |
+
"type": chunk.metadata.chunk_type,
|
| 1140 |
+
"level": chunk.metadata.level,
|
| 1141 |
+
"concepts": chunk.metadata.main_concepts or [],
|
| 1142 |
+
"keywords": chunk.metadata.keywords or [],
|
| 1143 |
+
"confidence": chunk.metadata.confidence_score
|
| 1144 |
+
}
|
| 1145 |
+
nodes.append(node)
|
| 1146 |
+
|
| 1147 |
+
# Extraction edges (relations)
|
| 1148 |
+
for chunk in chunks:
|
| 1149 |
+
# Relations parent-enfant
|
| 1150 |
+
if chunk.metadata.parent_id:
|
| 1151 |
+
edges.append({
|
| 1152 |
+
"source": chunk.metadata.parent_id,
|
| 1153 |
+
"target": chunk.metadata.chunk_id,
|
| 1154 |
+
"type": "parent_child",
|
| 1155 |
+
"weight": 1.0
|
| 1156 |
+
})
|
| 1157 |
+
|
| 1158 |
+
# Relations siblings
|
| 1159 |
+
if hasattr(chunk.metadata, 'next_id') and chunk.metadata.next_id:
|
| 1160 |
+
edges.append({
|
| 1161 |
+
"source": chunk.metadata.chunk_id,
|
| 1162 |
+
"target": chunk.metadata.next_id,
|
| 1163 |
+
"type": "sequential",
|
| 1164 |
+
"weight": 0.5
|
| 1165 |
+
})
|
| 1166 |
+
|
| 1167 |
+
return {
|
| 1168 |
+
"format": "concept_graph_v4",
|
| 1169 |
+
"nodes": nodes,
|
| 1170 |
+
"edges": edges,
|
| 1171 |
+
"statistics": {
|
| 1172 |
+
"total_nodes": len(nodes),
|
| 1173 |
+
"total_edges": len(edges),
|
| 1174 |
+
"levels": len(set(node["level"] for node in nodes))
|
| 1175 |
+
}
|
| 1176 |
+
}
|
| 1177 |
+
|
| 1178 |
+
# ===================================
|
| 1179 |
+
# UTILITAIRES ET HELPERS
|
| 1180 |
+
# ===================================
|
| 1181 |
+
|
| 1182 |
+
def _build_hierarchy_levels_v4(self, chunks: List[SemanticChunk]) -> Dict[str, Any]:
|
| 1183 |
+
"""Construction hiérarchie organisée par niveaux"""
|
| 1184 |
+
|
| 1185 |
+
hierarchy = {"levels": {}, "total_chunks": len(chunks), "root_chunks": []}
|
| 1186 |
+
|
| 1187 |
+
for chunk in chunks:
|
| 1188 |
+
level = chunk.metadata.level
|
| 1189 |
+
if level not in hierarchy["levels"]:
|
| 1190 |
+
hierarchy["levels"][level] = []
|
| 1191 |
+
|
| 1192 |
+
hierarchy["levels"][level].append({
|
| 1193 |
+
"id": chunk.metadata.chunk_id,
|
| 1194 |
+
"title": chunk.title,
|
| 1195 |
+
"parent_id": chunk.metadata.parent_id,
|
| 1196 |
+
"children_ids": chunk.metadata.children_ids,
|
| 1197 |
+
"confidence": chunk.metadata.confidence_score
|
| 1198 |
+
})
|
| 1199 |
+
|
| 1200 |
+
# Chunks racine (sans parent)
|
| 1201 |
+
if not chunk.metadata.parent_id:
|
| 1202 |
+
hierarchy["root_chunks"].append(chunk.metadata.chunk_id)
|
| 1203 |
+
|
| 1204 |
+
return hierarchy
|
| 1205 |
+
|
| 1206 |
+
def _build_source_metadata_v4(self, request: ChunkRequest) -> Dict[str, Any]:
|
| 1207 |
+
"""Construction métadonnées source"""
|
| 1208 |
+
|
| 1209 |
+
return {
|
| 1210 |
+
"title": request.titre or "Document sans titre",
|
| 1211 |
+
"source_id": request.source_id or "unknown",
|
| 1212 |
+
"text_length": len(request.text),
|
| 1213 |
+
"processing_options": {
|
| 1214 |
+
"include_metadata": request.include_metadata,
|
| 1215 |
+
"export_obsidian": request.export_obsidian,
|
| 1216 |
+
"export_agents": request.export_agents
|
| 1217 |
+
},
|
| 1218 |
+
"timestamp": time.time()
|
| 1219 |
+
}
|
| 1220 |
+
|
| 1221 |
+
async def _cleanup_memory_v4(self):
|
| 1222 |
+
"""Nettoyage mémoire pour HF Space"""
|
| 1223 |
+
try:
|
| 1224 |
+
gc.collect()
|
| 1225 |
+
if torch.cuda.is_available():
|
| 1226 |
+
torch.cuda.empty_cache()
|
| 1227 |
+
logger.debug("🧹 Nettoyage mémoire effectué")
|
| 1228 |
+
except Exception as e:
|
| 1229 |
+
logger.warning(f"⚠️ Erreur nettoyage mémoire: {e}")
|
| 1230 |
+
|
| 1231 |
+
# ===================================
|
| 1232 |
+
# MÉTHODES COMPATIBILITÉ API
|
| 1233 |
+
# ===================================
|
| 1234 |
+
|
| 1235 |
+
async def chunk_text(self, text: str, metadata: Optional[Dict[str, Any]] = None,
|
| 1236 |
+
method: str = "custom_recursive") -> List[Dict[str, Any]]:
|
| 1237 |
+
"""Point d'entrée compatible avec l'API simple"""
|
| 1238 |
+
|
| 1239 |
+
request = ChunkRequest(
|
| 1240 |
+
text=text,
|
| 1241 |
+
titre=metadata.get("title") if metadata else None,
|
| 1242 |
+
source_id=metadata.get("source_id") if metadata else None,
|
| 1243 |
+
include_metadata=True,
|
| 1244 |
+
export_obsidian=False,
|
| 1245 |
+
export_agents=False,
|
| 1246 |
+
metadata=metadata
|
| 1247 |
+
)
|
| 1248 |
+
|
| 1249 |
+
response = await self.process_text(request)
|
| 1250 |
+
|
| 1251 |
+
# Conversion vers format API simple
|
| 1252 |
+
return [
|
| 1253 |
+
{
|
| 1254 |
+
"id": chunk.metadata.chunk_id,
|
| 1255 |
+
"text": chunk.content,
|
| 1256 |
+
"title": chunk.title,
|
| 1257 |
+
"level": chunk.metadata.level,
|
| 1258 |
+
"parent_id": chunk.metadata.parent_id,
|
| 1259 |
+
"children_ids": chunk.metadata.children_ids,
|
| 1260 |
+
"metadata": {
|
| 1261 |
+
"keywords": chunk.metadata.keywords,
|
| 1262 |
+
"concepts": chunk.metadata.main_concepts,
|
| 1263 |
+
"confidence": chunk.metadata.confidence_score,
|
| 1264 |
+
"chunk_type": chunk.metadata.chunk_type
|
| 1265 |
+
},
|
| 1266 |
+
"has_embedding": chunk.embedding_vector is not None
|
| 1267 |
+
}
|
| 1268 |
+
for chunk in response.chunks
|
| 1269 |
+
]
|
| 1270 |
+
|
| 1271 |
+
async def chunk_with_hierarchy(self, text: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
| 1272 |
+
"""Point d'entrée pour chunking hiérarchique complet"""
|
| 1273 |
+
|
| 1274 |
+
request = ChunkRequest(
|
| 1275 |
+
text=text,
|
| 1276 |
+
titre=metadata.get("title") if metadata else None,
|
| 1277 |
+
source_id=metadata.get("source_id") if metadata else None,
|
| 1278 |
+
include_metadata=True,
|
| 1279 |
+
export_obsidian=True,
|
| 1280 |
+
export_agents=True,
|
| 1281 |
+
metadata=metadata
|
| 1282 |
+
)
|
| 1283 |
+
|
| 1284 |
+
response = await self.process_text(request)
|
| 1285 |
+
|
| 1286 |
+
return {
|
| 1287 |
+
"hierarchy": response.hierarchy,
|
| 1288 |
+
"total_chunks": response.total_chunks,
|
| 1289 |
+
"total_levels": len(response.hierarchy["levels"]),
|
| 1290 |
+
"root_chunks": response.hierarchy["root_chunks"],
|
| 1291 |
+
"obsidian_export": response.obsidian_export,
|
| 1292 |
+
"agent_knowledge": response.agent_knowledge,
|
| 1293 |
+
"concept_graph": response.concept_graph
|
| 1294 |
+
}
|
| 1295 |
+
|
| 1296 |
+
def export_to_obsidian(self, chunks: Union[List[SemanticChunk], List[Dict[str, Any]]],
|
| 1297 |
+
source_title: str = "Document",
|
| 1298 |
+
output_path: Optional[str] = None) -> str:
|
| 1299 |
+
"""Export vers fichier Obsidian"""
|
| 1300 |
+
|
| 1301 |
+
# Si chunks au format dict, reconvertit vers SemanticChunk
|
| 1302 |
+
if chunks and isinstance(chunks[0], dict):
|
| 1303 |
+
semantic_chunks = []
|
| 1304 |
+
for chunk_dict in chunks:
|
| 1305 |
+
metadata = ChunkMetadata(
|
| 1306 |
+
chunk_id=chunk_dict.get("id"),
|
| 1307 |
+
level=chunk_dict.get("level", ChunkLevel.CONCEPT),
|
| 1308 |
+
parent_id=chunk_dict.get("parent_id"),
|
| 1309 |
+
children_ids=chunk_dict.get("children_ids", []),
|
| 1310 |
+
tokens_count=len(chunk_dict.get("text", "").split()),
|
| 1311 |
+
source_title=source_title,
|
| 1312 |
+
source_id="export",
|
| 1313 |
+
confidence_score=chunk_dict.get("metadata", {}).get("confidence", 0.8),
|
| 1314 |
+
keywords=chunk_dict.get("metadata", {}).get("keywords", []),
|
| 1315 |
+
main_concepts=chunk_dict.get("metadata", {}).get("concepts", []),
|
| 1316 |
+
chunk_type=chunk_dict.get("metadata", {}).get("chunk_type", "concept")
|
| 1317 |
+
)
|
| 1318 |
+
|
| 1319 |
+
semantic_chunk = SemanticChunk(
|
| 1320 |
+
content=chunk_dict.get("text", ""),
|
| 1321 |
+
title=chunk_dict.get("title", "Chunk"),
|
| 1322 |
+
metadata=metadata
|
| 1323 |
+
)
|
| 1324 |
+
semantic_chunks.append(semantic_chunk)
|
| 1325 |
+
chunks = semantic_chunks
|
| 1326 |
+
|
| 1327 |
+
# Génération contenu Obsidian
|
| 1328 |
+
if self.custom_recursive_chunker:
|
| 1329 |
+
# Conversion vers ChunkResult pour l'export
|
| 1330 |
+
chunk_results = []
|
| 1331 |
+
for chunk in chunks:
|
| 1332 |
+
chunk_result = ChunkResult(
|
| 1333 |
+
id=chunk.metadata.chunk_id,
|
| 1334 |
+
text=chunk.content,
|
| 1335 |
+
level=0, # Simplifié pour l'export
|
| 1336 |
+
parent_id=chunk.metadata.parent_id,
|
| 1337 |
+
children_ids=chunk.metadata.children_ids,
|
| 1338 |
+
metadata=chunk.metadata.__dict__
|
| 1339 |
+
)
|
| 1340 |
+
chunk_results.append(chunk_result)
|
| 1341 |
+
|
| 1342 |
+
obsidian_content = self.custom_recursive_chunker.to_obsidian_format(chunk_results, source_title)
|
| 1343 |
+
else:
|
| 1344 |
+
# Fallback simple
|
| 1345 |
+
obsidian_content = f"# {source_title}\n\n"
|
| 1346 |
+
for chunk in chunks:
|
| 1347 |
+
obsidian_content += f"## {chunk.title}\n\n{chunk.content}\n\n---\n\n"
|
| 1348 |
+
|
| 1349 |
+
# Sauvegarde si chemin spécifié
|
| 1350 |
+
if output_path:
|
| 1351 |
+
try:
|
| 1352 |
+
output_file = Path(output_path)
|
| 1353 |
+
output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 1354 |
+
|
| 1355 |
+
with open(output_file, 'w', encoding='utf-8') as f:
|
| 1356 |
+
f.write(obsidian_content)
|
| 1357 |
+
|
| 1358 |
+
logger.info(f"✅ Export Obsidian sauvegardé: {output_path}")
|
| 1359 |
+
|
| 1360 |
+
except Exception as e:
|
| 1361 |
+
logger.error(f"❌ Erreur sauvegarde Obsidian: {e}")
|
| 1362 |
+
|
| 1363 |
+
return obsidian_content
|
| 1364 |
+
|
| 1365 |
+
async def health_check(self) -> Dict[str, Any]:
|
| 1366 |
+
"""Vérification de l'état du pipeline"""
|
| 1367 |
+
health_status = {
|
| 1368 |
+
"initialized": self._is_initialized,
|
| 1369 |
+
"components": {
|
| 1370 |
+
"llm": self.llm is not None,
|
| 1371 |
+
"embed_model": self.embed_model is not None,
|
| 1372 |
+
"custom_recursive_chunker": self.custom_recursive_chunker is not None,
|
| 1373 |
+
"chonkie_semantic": self.chonkie_semantic is not None,
|
| 1374 |
+
"sentence_splitter": self.sentence_splitter is not None
|
| 1375 |
+
},
|
| 1376 |
+
"capabilities": {
|
| 1377 |
+
"custom_recursive": self.custom_recursive_chunker is not None,
|
| 1378 |
+
"chonkie_semantic": CHONKIE_AVAILABLE and self.chonkie_semantic is not None,
|
| 1379 |
+
"llamaindex_fallback": self.sentence_splitter is not None,
|
| 1380 |
+
"obsidian_export": True,
|
| 1381 |
+
"agent_knowledge": True,
|
| 1382 |
+
"bidirectional_relations": True
|
| 1383 |
+
},
|
| 1384 |
+
"config_loaded": bool(self.config),
|
| 1385 |
+
"chonkie_available": CHONKIE_AVAILABLE
|
| 1386 |
+
}
|
| 1387 |
+
|
| 1388 |
+
# Test rapide si initialisé
|
| 1389 |
+
if self._is_initialized and self.custom_recursive_chunker:
|
| 1390 |
+
try:
|
| 1391 |
+
test_chunks = await self.chunk_text("Test de santé du pipeline v4.0.", method="custom_recursive")
|
| 1392 |
+
health_status["test_chunking"] = len(test_chunks) > 0
|
| 1393 |
+
except Exception as e:
|
| 1394 |
+
health_status["test_chunking"] = False
|
| 1395 |
+
health_status["test_error"] = str(e)
|
| 1396 |
+
|
| 1397 |
+
return health_status
|
| 1398 |
+
|
| 1399 |
+
def get_available_methods(self) -> List[str]:
|
| 1400 |
+
"""Retourne les méthodes de chunking disponibles"""
|
| 1401 |
+
methods = []
|
| 1402 |
+
|
| 1403 |
+
if self.custom_recursive_chunker:
|
| 1404 |
+
methods.append("custom_recursive")
|
| 1405 |
+
|
| 1406 |
+
if CHONKIE_AVAILABLE and self.chonkie_semantic:
|
| 1407 |
+
methods.append("chonkie_semantic")
|
| 1408 |
+
|
| 1409 |
+
if self.sentence_splitter:
|
| 1410 |
+
methods.append("llamaindex")
|
| 1411 |
+
|
| 1412 |
+
return methods
|
| 1413 |
+
|
| 1414 |
+
def get_config_summary(self) -> Dict[str, Any]:
|
| 1415 |
+
"""Retourne un résumé de la configuration active"""
|
| 1416 |
+
return {
|
| 1417 |
+
"models": {
|
| 1418 |
+
"llm_model": self.config.get("models", {}).get("llm", {}).get("model_name"),
|
| 1419 |
+
"embedding_model": self.config.get("models", {}).get("embedding", {}).get("model_name")
|
| 1420 |
+
},
|
| 1421 |
+
"chunking_methods": self.get_available_methods(),
|
| 1422 |
+
"custom_recursive_config": self.config.get("chunking", {}).get("custom_recursive", {}),
|
| 1423 |
+
"chonkie_available": CHONKIE_AVAILABLE,
|
| 1424 |
+
"obsidian_config": self.config.get("obsidian", {}),
|
| 1425 |
+
"features": {
|
| 1426 |
+
"bidirectional_relations": True,
|
| 1427 |
+
"intelligent_enrichment": True,
|
| 1428 |
+
"concept_extraction": True,
|
| 1429 |
+
"agent_knowledge_export": True,
|
| 1430 |
+
"obsidian_vault_export": True,
|
| 1431 |
+
"semantic_similarity": True
|
| 1432 |
+
}
|
| 1433 |
+
}
|
| 1434 |
+
|
| 1435 |
+
# ===================================
|
| 1436 |
+
# MÉTHODES DE TEST ET VALIDATION
|
| 1437 |
+
# ===================================
|
| 1438 |
+
|
| 1439 |
+
async def run_comprehensive_test(self) -> Dict[str, Any]:
|
| 1440 |
+
"""Test complet de validation du pipeline v4.0"""
|
| 1441 |
+
|
| 1442 |
+
logger.info("🧪 Début test complet pipeline v4.0")
|
| 1443 |
+
|
| 1444 |
+
test_request = ChunkRequest(
|
| 1445 |
+
text="""
|
| 1446 |
+
# Intelligence Artificielle et Machine Learning
|
| 1447 |
+
|
| 1448 |
+
L'intelligence artificielle représente l'un des défis technologiques majeurs du 21e siècle.
|
| 1449 |
+
|
| 1450 |
+
## Définitions et Concepts
|
| 1451 |
+
|
| 1452 |
+
L'IA englobe plusieurs domaines comme l'apprentissage automatique, le traitement du langage naturel,
|
| 1453 |
+
et la vision par ordinateur. Ces technologies transforment notre façon de travailler et de vivre.
|
| 1454 |
+
|
| 1455 |
+
### Machine Learning
|
| 1456 |
+
|
| 1457 |
+
Le machine learning, en particulier, permet aux systèmes d'apprendre à partir de données sans
|
| 1458 |
+
être explicitement programmés pour chaque tâche spécifique.
|
| 1459 |
+
|
| 1460 |
+
#### Apprentissage Supervisé
|
| 1461 |
+
|
| 1462 |
+
L'apprentissage supervisé utilise des données étiquetées pour entraîner les modèles.
|
| 1463 |
+
|
| 1464 |
+
#### Apprentissage Non Supervisé
|
| 1465 |
+
|
| 1466 |
+
L'apprentissage non supervisé découvre des patterns dans les données sans étiquettes.
|
| 1467 |
+
|
| 1468 |
+
## Applications Pratiques
|
| 1469 |
+
|
| 1470 |
+
Les applications de l'IA sont nombreuses : reconnaissance vocale, traduction automatique,
|
| 1471 |
+
véhicules autonomes, diagnostic médical, et bien d'autres domaines.
|
| 1472 |
+
|
| 1473 |
+
### Exemples Concrets
|
| 1474 |
+
|
| 1475 |
+
Par exemple, les assistants virtuels comme Siri ou Alexa utilisent le traitement du langage naturel.
|
| 1476 |
+
Les voitures Tesla emploient la vision par ordinateur pour la conduite autonome.
|
| 1477 |
+
|
| 1478 |
+
## Défis et Enjeux
|
| 1479 |
+
|
| 1480 |
+
Cependant, l'IA soulève aussi des questions éthiques importantes concernant l'emploi,
|
| 1481 |
+
la vie privée et la prise de décision automatisée.
|
| 1482 |
+
""",
|
| 1483 |
+
titre="Test Validation v4.0",
|
| 1484 |
+
source_id="validation_test_v4",
|
| 1485 |
+
include_metadata=True,
|
| 1486 |
+
export_obsidian=True,
|
| 1487 |
+
export_agents=True
|
| 1488 |
+
)
|
| 1489 |
+
|
| 1490 |
+
start_time = time.time()
|
| 1491 |
+
result = await self.process_text(test_request)
|
| 1492 |
+
test_time = time.time() - start_time
|
| 1493 |
+
|
| 1494 |
+
# Vérifications détaillées
|
| 1495 |
+
checks = {
|
| 1496 |
+
"chunking_functional": result.total_chunks > 0,
|
| 1497 |
+
"metadata_extracted": len(result.chunks[0].metadata.keywords) > 0 if result.chunks else False,
|
| 1498 |
+
"hierarchy_built": len(result.hierarchy["levels"]) > 1,
|
| 1499 |
+
"obsidian_export": result.obsidian_export is not None,
|
| 1500 |
+
"agent_knowledge": result.agent_knowledge is not None,
|
| 1501 |
+
"concept_graph": result.concept_graph is not None,
|
| 1502 |
+
"bidirectional_relations": any(chunk.metadata.children_ids for chunk in result.chunks),
|
| 1503 |
+
"processing_time_ok": test_time < 60 # Moins de 60s
|
| 1504 |
+
}
|
| 1505 |
+
|
| 1506 |
+
success_rate = sum(checks.values()) / len(checks) * 100
|
| 1507 |
+
|
| 1508 |
+
return {
|
| 1509 |
+
"test_status": "✅ SUCCESS" if success_rate >= 90 else "⚠️ PARTIAL" if success_rate >= 70 else "❌ FAILED",
|
| 1510 |
+
"success_rate": f"{success_rate:.1f}%",
|
| 1511 |
+
"results": {
|
| 1512 |
+
"chunks_generated": result.total_chunks,
|
| 1513 |
+
"tokens_processed": result.total_tokens,
|
| 1514 |
+
"processing_time": round(test_time, 2),
|
| 1515 |
+
"hierarchy_levels": len(result.hierarchy["levels"]),
|
| 1516 |
+
"obsidian_notes": len(result.obsidian_export.get("notes", [])) if result.obsidian_export else 0,
|
| 1517 |
+
"agent_knowledge_items": sum(len(items) for items in result.agent_knowledge.get("knowledge_base", {}).values()) if result.agent_knowledge else 0
|
| 1518 |
+
},
|
| 1519 |
+
"checks": checks,
|
| 1520 |
+
"features_validated": [
|
| 1521 |
+
"✅ Chunking sémantique hiérarchique",
|
| 1522 |
+
"✅ Relations bidirectionnelles parent-enfant",
|
| 1523 |
+
"✅ Extraction concepts et métadonnées",
|
| 1524 |
+
"✅ Export Obsidian format [[Titre]], id",
|
| 1525 |
+
"✅ Base connaissance agents spécialisés",
|
| 1526 |
+
"✅ Graphe de concepts",
|
| 1527 |
+
"✅ Pipeline complet fonctionnel"
|
| 1528 |
+
],
|
| 1529 |
+
"sample_chunk": {
|
| 1530 |
+
"id": result.chunks[0].metadata.chunk_id if result.chunks else None,
|
| 1531 |
+
"title": result.chunks[0].title if result.chunks else None,
|
| 1532 |
+
"level": result.chunks[0].metadata.level if result.chunks else None,
|
| 1533 |
+
"keywords": result.chunks[0].metadata.keywords if result.chunks else None,
|
| 1534 |
+
"concepts": result.chunks[0].metadata.main_concepts if result.chunks else None
|
| 1535 |
+
}
|
| 1536 |
+
}
|
| 1537 |
+
|
| 1538 |
+
# ===================================
|
| 1539 |
+
# EXPORTS ET CLASSES PRINCIPALES
|
| 1540 |
+
# ===================================
|
| 1541 |
+
|
| 1542 |
+
# Export des classes principales pour import
|
| 1543 |
+
__all__ = [
|
| 1544 |
+
"SmartChunkerPipeline",
|
| 1545 |
+
"EmbeddingWrapper",
|
| 1546 |
+
"CustomRecursiveChunker",
|
| 1547 |
+
"ChunkResult",
|
| 1548 |
+
"SemanticChunk",
|
| 1549 |
+
"ChunkMetadata",
|
| 1550 |
+
"ChunkLevel"
|
| 1551 |
+
]
|
| 1552 |
+
|
| 1553 |
+
# ===================================
|
| 1554 |
+
# POINT D'ENTRÉE POUR TESTS
|
| 1555 |
+
# ===================================
|
| 1556 |
+
|
| 1557 |
+
async def main():
|
| 1558 |
+
"""Point d'entrée pour tests locaux"""
|
| 1559 |
+
|
| 1560 |
+
# Test rapide du pipeline
|
| 1561 |
+
pipeline = SmartChunkerPipeline()
|
| 1562 |
+
|
| 1563 |
+
try:
|
| 1564 |
+
await pipeline.initialize()
|
| 1565 |
+
logger.info("✅ Pipeline initialisé avec succès")
|
| 1566 |
+
|
| 1567 |
+
# Test complet
|
| 1568 |
+
test_results = await pipeline.run_comprehensive_test()
|
| 1569 |
+
logger.info(f"🧪 Test terminé: {test_results['test_status']}")
|
| 1570 |
+
logger.info(f"📊 Taux de succès: {test_results['success_rate']}")
|
| 1571 |
+
|
| 1572 |
+
for feature in test_results['features_validated']:
|
| 1573 |
+
logger.info(f" {feature}")
|
| 1574 |
+
|
| 1575 |
+
except Exception as e:
|
| 1576 |
+
logger.error(f"❌ Erreur test pipeline: {e}")
|
| 1577 |
+
raise
|
| 1578 |
+
|
| 1579 |
+
if __name__ == "__main__":
|
| 1580 |
+
asyncio.run(main())
|
| 1581 |
+
|
| 1582 |
+
|
config.yaml
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration complète du Chunking Sémantique Intelligent Récursif
|
| 2 |
+
# Optimisée pour Hugging Face Space gratuit (2GB RAM, CPU)
|
| 3 |
+
# Version: 4.0.0 - FINALE FUSIONNÉE
|
| 4 |
+
|
| 5 |
+
# ===== CONFIGURATION MODÈLES =====
|
| 6 |
+
models:
|
| 7 |
+
# LLM principal (compatible LlamaIndex v0.12)
|
| 8 |
+
llm:
|
| 9 |
+
provider: llama-cpp
|
| 10 |
+
model_url: "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
|
| 11 |
+
temperature: 0.1
|
| 12 |
+
max_tokens: 512
|
| 13 |
+
context_window: 2048
|
| 14 |
+
generate_kwargs:
|
| 15 |
+
top_p: 0.95
|
| 16 |
+
model_kwargs:
|
| 17 |
+
n_gpu_layers: 0
|
| 18 |
+
torch_dtype: float32
|
| 19 |
+
cache_dir: /tmp/llm # ✅ Écrivable sur HF Space
|
| 20 |
+
|
| 21 |
+
# Embedding local (pas d'API externe)
|
| 22 |
+
embedding:
|
| 23 |
+
provider: "huggingface"
|
| 24 |
+
model_name: "sentence-transformers/all-MiniLM-L6-v2" # Performant et léger
|
| 25 |
+
cache_dir: /tmp/embeddings # ✅ Écrivable sur HF Space
|
| 26 |
+
max_length: 512
|
| 27 |
+
normalize: true
|
| 28 |
+
batch_size: 32 # Optimisé pour Space gratuit
|
| 29 |
+
device: "cpu"
|
| 30 |
+
alternatives:
|
| 31 |
+
- "sentence-transformers/all-MiniLM-L6-v2" # Principal
|
| 32 |
+
- "jinaai/jina-embeddings-v2-small-en" # Alternative
|
| 33 |
+
- "nomic-ai/nomic-embed-text-v1" # Fallback
|
| 34 |
+
|
| 35 |
+
# ===== CONFIGURATION CHUNKING =====
|
| 36 |
+
chunking:
|
| 37 |
+
# ✅ NOUVEAU: CustomRecursiveChunker (principal)
|
| 38 |
+
custom_recursive:
|
| 39 |
+
enabled: true
|
| 40 |
+
chunk_sizes: [2048, 512, 128] # Hiérarchie 3 niveaux
|
| 41 |
+
separators: ["\n\n", "\n", ".", "!", "?", "—", "-"]
|
| 42 |
+
overlap_ratio: 0.1
|
| 43 |
+
min_chunk_size: 50
|
| 44 |
+
semantic_threshold: 0.75
|
| 45 |
+
|
| 46 |
+
# Chonkie (optionnel si disponible)
|
| 47 |
+
chonkie:
|
| 48 |
+
recursive:
|
| 49 |
+
enabled: true
|
| 50 |
+
chunk_sizes: [2048, 512, 128]
|
| 51 |
+
separators: ["\n\n", "\n", ".", "!", "?", "—", "-"]
|
| 52 |
+
shrink_size: 500
|
| 53 |
+
preserve_separators: false
|
| 54 |
+
include_raw_chunks: false
|
| 55 |
+
|
| 56 |
+
semantic:
|
| 57 |
+
enabled: true
|
| 58 |
+
threshold: 0.75 # Seuil similarité sémantique
|
| 59 |
+
chunk_size: 512
|
| 60 |
+
min_sentences: 1
|
| 61 |
+
max_sentences: 8
|
| 62 |
+
|
| 63 |
+
# Détection structure automatique
|
| 64 |
+
structure_detection:
|
| 65 |
+
markdown:
|
| 66 |
+
enabled: true
|
| 67 |
+
header_levels: [1, 2, 3, 4, 5, 6] # H1 à H6
|
| 68 |
+
preserve_hierarchy: true
|
| 69 |
+
extract_metadata: true
|
| 70 |
+
|
| 71 |
+
chapters:
|
| 72 |
+
enabled: true
|
| 73 |
+
patterns: ["Chapter", "Chapitre", "Section", "Part", "Partie"]
|
| 74 |
+
case_sensitive: false
|
| 75 |
+
min_chapter_length: 1000 # Minimum 1000 caractères
|
| 76 |
+
|
| 77 |
+
lists:
|
| 78 |
+
enabled: true
|
| 79 |
+
numbered: true
|
| 80 |
+
bulleted: true
|
| 81 |
+
preserve_structure: true
|
| 82 |
+
|
| 83 |
+
# Chunking sémantique (LlamaIndex fallback)
|
| 84 |
+
semantic:
|
| 85 |
+
enabled: true
|
| 86 |
+
buffer_size: 1
|
| 87 |
+
breakpoint_percentile_threshold: 95
|
| 88 |
+
embed_model: null # Utilise le modèle global
|
| 89 |
+
|
| 90 |
+
# Enrichissement intelligent
|
| 91 |
+
enrichment:
|
| 92 |
+
concepts:
|
| 93 |
+
enabled: true
|
| 94 |
+
extraction_method: "regex_statistical" # regex_statistical, llm, hybrid
|
| 95 |
+
min_concept_length: 3
|
| 96 |
+
max_concepts_per_chunk: 10
|
| 97 |
+
confidence_threshold: 0.6
|
| 98 |
+
|
| 99 |
+
keywords:
|
| 100 |
+
enabled: true
|
| 101 |
+
extraction_method: "statistical" # statistical, tfidf, llm
|
| 102 |
+
max_keywords_per_chunk: 5
|
| 103 |
+
min_frequency: 2
|
| 104 |
+
|
| 105 |
+
summaries:
|
| 106 |
+
enabled: false # Désactivé par défaut (économie tokens LLM)
|
| 107 |
+
max_length: 100
|
| 108 |
+
include_parent_context: true
|
| 109 |
+
local_context_window: 3 # Chunks avant/après
|
| 110 |
+
|
| 111 |
+
# ===== CONFIGURATION OBSIDIAN =====
|
| 112 |
+
obsidian:
|
| 113 |
+
# ✅ Format exact spécifié
|
| 114 |
+
parent_format: "[[{title}]], {id}"
|
| 115 |
+
use_bidirectional_links: true
|
| 116 |
+
vault_name: "Smart_Chunks_Vault"
|
| 117 |
+
|
| 118 |
+
# Structure du vault
|
| 119 |
+
folder_structure:
|
| 120 |
+
by_level: true # Dossiers par niveau hiérarchique
|
| 121 |
+
by_source: false # Pas de dossiers par source
|
| 122 |
+
by_type: false # Pas de dossiers par type de chunk
|
| 123 |
+
|
| 124 |
+
# Contenu des notes
|
| 125 |
+
include_metadata: true
|
| 126 |
+
include_relations: true
|
| 127 |
+
include_concepts: true
|
| 128 |
+
include_keywords: true
|
| 129 |
+
generate_graph_view: true
|
| 130 |
+
|
| 131 |
+
# Templates
|
| 132 |
+
note_template: |
|
| 133 |
+
---
|
| 134 |
+
id: {chunk_id}
|
| 135 |
+
level: {level}
|
| 136 |
+
type: {chunk_type}
|
| 137 |
+
source: {source_title}
|
| 138 |
+
created: {timestamp}
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
{relations}
|
| 142 |
+
|
| 143 |
+
## Contenu
|
| 144 |
+
{content}
|
| 145 |
+
|
| 146 |
+
## Métadonnées
|
| 147 |
+
- **Concepts:** {concepts}
|
| 148 |
+
- **Mots-clés:** {keywords}
|
| 149 |
+
- **Confiance:** {confidence}
|
| 150 |
+
|
| 151 |
+
# ===== CONFIGURATION EXPORTS =====
|
| 152 |
+
exports:
|
| 153 |
+
# Export Second Cerveau (Obsidian)
|
| 154 |
+
obsidian:
|
| 155 |
+
enabled: true
|
| 156 |
+
format: "markdown"
|
| 157 |
+
include_frontmatter: true
|
| 158 |
+
include_backmatter: true # Relations basé_sur, utilisé_dans
|
| 159 |
+
generate_graph_data: true
|
| 160 |
+
vault_structure:
|
| 161 |
+
use_folders: true
|
| 162 |
+
folder_by_level: true
|
| 163 |
+
folder_by_source: false
|
| 164 |
+
|
| 165 |
+
# Export Agents Spécialisés
|
| 166 |
+
agents:
|
| 167 |
+
enabled: true
|
| 168 |
+
separate_by_type: true # Séparer principes, méthodes, exemples
|
| 169 |
+
include_relations: true
|
| 170 |
+
confidence_filter: 0.5 # Filtrer chunks basse confiance
|
| 171 |
+
knowledge_base_format: "structured" # structured, flat, hierarchical
|
| 172 |
+
|
| 173 |
+
# Types de knowledge pour agents
|
| 174 |
+
classification:
|
| 175 |
+
principles: ["principe", "règle", "loi", "axiome"]
|
| 176 |
+
methods: ["méthode", "procédure", "technique", "algorithme"]
|
| 177 |
+
examples: ["exemple", "illustration", "cas", "instance"]
|
| 178 |
+
definitions: ["définition", "concept", "terme", "notion"]
|
| 179 |
+
frameworks: ["framework", "modèle", "architecture", "paradigme"]
|
| 180 |
+
|
| 181 |
+
# Export graphe de concepts
|
| 182 |
+
concept_graph:
|
| 183 |
+
enabled: true
|
| 184 |
+
format: "json" # json, graphml, cypher
|
| 185 |
+
include_weights: true
|
| 186 |
+
minimum_connections: 2
|
| 187 |
+
max_concepts: 100 # Limite pour lisibilité
|
| 188 |
+
edge_types: ["parent_child", "semantic_similarity", "concept_overlap"]
|
| 189 |
+
|
| 190 |
+
# ===== OPTIMISATIONS PERFORMANCE =====
|
| 191 |
+
performance:
|
| 192 |
+
# Gestion mémoire (Space gratuit 2GB)
|
| 193 |
+
memory:
|
| 194 |
+
max_memory_mb: 1800 # Limite sécurité
|
| 195 |
+
enable_garbage_collection: true
|
| 196 |
+
cleanup_interval: 100 # Nettoyage tous les 100 chunks
|
| 197 |
+
cache_size_limit: 1000 # Éléments max en cache
|
| 198 |
+
|
| 199 |
+
# Threading et concurrence
|
| 200 |
+
concurrency:
|
| 201 |
+
max_workers: 1 # Space gratuit = 1 seul worker
|
| 202 |
+
thread_pool_size: 1
|
| 203 |
+
enable_async: true
|
| 204 |
+
timeout_seconds: 30
|
| 205 |
+
|
| 206 |
+
# Cache intelligent
|
| 207 |
+
caching:
|
| 208 |
+
enabled: true
|
| 209 |
+
embedding_cache: true
|
| 210 |
+
concept_cache: true
|
| 211 |
+
llm_cache: false # Désactivé pour économiser mémoire
|
| 212 |
+
cache_ttl: 3600 # 1 heure
|
| 213 |
+
max_cache_size_mb: 100
|
| 214 |
+
|
| 215 |
+
# ===== CONFIGURATION LOGGING =====
|
| 216 |
+
logging:
|
| 217 |
+
level: "INFO" # DEBUG, INFO, WARNING, ERROR
|
| 218 |
+
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 219 |
+
file_enabled: false # Pas de fichiers log sur Space
|
| 220 |
+
console_enabled: true
|
| 221 |
+
max_log_size_mb: 10
|
| 222 |
+
|
| 223 |
+
# Logs spécifiques
|
| 224 |
+
chunking_progress: true
|
| 225 |
+
memory_monitoring: true
|
| 226 |
+
performance_metrics: true
|
| 227 |
+
error_details: true
|
| 228 |
+
|
| 229 |
+
# ===== CONFIGURATION API =====
|
| 230 |
+
api:
|
| 231 |
+
# Serveur FastAPI
|
| 232 |
+
server:
|
| 233 |
+
host: "0.0.0.0"
|
| 234 |
+
port: 7860
|
| 235 |
+
workers: 1
|
| 236 |
+
reload: false # Production mode
|
| 237 |
+
access_log: false # Économie ressources
|
| 238 |
+
|
| 239 |
+
# Limites et validation
|
| 240 |
+
limits:
|
| 241 |
+
max_text_length: 500000 # 500k caractères max
|
| 242 |
+
min_text_length: 10
|
| 243 |
+
max_chunks_per_request: 1000
|
| 244 |
+
max_requests_per_minute: 30 # Rate limiting
|
| 245 |
+
|
| 246 |
+
# CORS et sécurité
|
| 247 |
+
cors:
|
| 248 |
+
allow_origins: ["*"] # Pour n8n et tests
|
| 249 |
+
allow_methods: ["GET", "POST", "OPTIONS"]
|
| 250 |
+
allow_headers: ["*"]
|
| 251 |
+
|
| 252 |
+
# Réponses et formats
|
| 253 |
+
response:
|
| 254 |
+
include_debug_info: false
|
| 255 |
+
compress_responses: true
|
| 256 |
+
default_format: "json"
|
| 257 |
+
|
| 258 |
+
# ===== CONFIGURATION ENVIRONNEMENT =====
|
| 259 |
+
environment:
|
| 260 |
+
# Détection automatique environnement
|
| 261 |
+
auto_detect: true
|
| 262 |
+
|
| 263 |
+
# Spécifique Hugging Face Space
|
| 264 |
+
huggingface_space:
|
| 265 |
+
space_id: null # Auto-détecté
|
| 266 |
+
space_url: null # Auto-détecté
|
| 267 |
+
cpu_only: true
|
| 268 |
+
memory_limit: "2GB"
|
| 269 |
+
storage_limit: "1GB"
|
| 270 |
+
|
| 271 |
+
# Développement local
|
| 272 |
+
local_development:
|
| 273 |
+
enable_hot_reload: true
|
| 274 |
+
debug_mode: true
|
| 275 |
+
verbose_logging: true
|
| 276 |
+
|
| 277 |
+
# Chemins et cache
|
| 278 |
+
paths:
|
| 279 |
+
temp_dir: "/tmp"
|
| 280 |
+
cache_dir: "/tmp/cache"
|
| 281 |
+
models_dir: "/tmp/models"
|
| 282 |
+
logs_dir: "/tmp/logs"
|
| 283 |
+
|
| 284 |
+
# Variables d'environnement
|
| 285 |
+
env_vars:
|
| 286 |
+
HF_HOME: "/tmp/huggingface"
|
| 287 |
+
TRANSFORMERS_CACHE: "/tmp/transformers"
|
| 288 |
+
TOKENIZERS_PARALLELISM: "false"
|
| 289 |
+
HF_HUB_DISABLE_PROGRESS_BARS: "1"
|
| 290 |
+
|
| 291 |
+
# ===== CONFIGURATION AVANCÉE =====
|
| 292 |
+
advanced:
|
| 293 |
+
# Expérimental
|
| 294 |
+
experimental:
|
| 295 |
+
neural_chunking: false
|
| 296 |
+
llm_guided_chunking: false
|
| 297 |
+
multi_language_detection: false
|
| 298 |
+
|
| 299 |
+
# Optimisations spécifiques
|
| 300 |
+
optimizations:
|
| 301 |
+
batch_processing: true
|
| 302 |
+
parallel_embedding: false # CPU only
|
| 303 |
+
memory_mapping: false
|
| 304 |
+
lazy_loading: true
|
| 305 |
+
|
| 306 |
+
# Fallbacks et robustesse
|
| 307 |
+
fallbacks:
|
| 308 |
+
enable_fallback_chunking: true
|
| 309 |
+
fallback_method: "llamaindex"
|
| 310 |
+
max_retry_attempts: 3
|
| 311 |
+
graceful_degradation: true
|
| 312 |
+
|
| 313 |
+
# ===== MÉTADONNÉES CONFIGURATION =====
|
| 314 |
+
metadata:
|
| 315 |
+
version: "4.0.0"
|
| 316 |
+
created_date: "2025-06-29"
|
| 317 |
+
last_modified: "2025-06-29"
|
| 318 |
+
author: "Smart Chunker Pipeline v4.0"
|
| 319 |
+
description: "Configuration complète pour chunking sémantique intelligent récursif"
|
| 320 |
+
compatibility:
|
| 321 |
+
llama_index: "0.12.x"
|
| 322 |
+
chonkie: "1.0.8+"
|
| 323 |
+
python: "3.10+"
|
| 324 |
+
huggingface_spaces: true
|
custom_recursive_chunker.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Custom Recursive Semantic Chunker v4.0
|
| 3 |
+
Contourne les limitations de chonkie 1.0.10 et implemente
|
| 4 |
+
un chunking récursif intelligent avec hiérarchie et parentalité.
|
| 5 |
+
|
| 6 |
+
Auteur: Assistant Claude
|
| 7 |
+
Compatible avec: LlamaIndex v0.12, HuggingFace embeddings
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
import hashlib
|
| 12 |
+
import logging
|
| 13 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from llama_index.core.schema import BaseEmbedding
|
| 16 |
+
|
| 17 |
+
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
@dataclass
class ChunkResult:
    """Single chunk with hierarchical metadata (parent/children links).

    Attributes:
        id: Stable short identifier (md5-derived) for the chunk.
        text: The chunk content.
        level: Hierarchy depth (0 = largest chunks).
        parent_id: Id of the enclosing chunk, or None for root-level chunks.
        children_ids: Ids of sub-chunks produced from this chunk.
        metadata: Free-form metadata (position, sizes, caller-supplied keys).
        embedding_vector: Embedding of ``text``, or None if embedding failed.
        semantic_similarity: Max cosine similarity to sibling chunks of the
            same level, or None if not computed.
    """
    id: str
    text: str
    level: int
    parent_id: Optional[str] = None
    # FIX: use default_factory instead of a None default so the declared
    # types (List / Dict) match reality and each instance gets a fresh
    # container.  The __post_init__ guard is kept so callers that still
    # pass None explicitly get the old normalization behavior.
    children_ids: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)
    embedding_vector: Optional[List[float]] = None
    semantic_similarity: Optional[float] = None

    def __post_init__(self):
        # Backward compatibility: normalize an explicit None to an empty
        # container, as the previous implementation did.
        if self.children_ids is None:
            self.children_ids = []
        if self.metadata is None:
            self.metadata = {}
|
| 36 |
+
|
| 37 |
+
class CustomRecursiveChunker:
    """
    Intelligent recursive chunker that builds a hierarchy of chunks
    (large -> medium -> small) with explicit parent/child relations.

    Replaces the unstable chonkie-based recursive chunking with a fully
    controlled implementation compatible with LlamaIndex BaseEmbedding
    models (v0.12).
    """

    def __init__(self,
                 embed_model: BaseEmbedding,
                 chunk_sizes: Optional[List[int]] = None,
                 separators: Optional[List[str]] = None,
                 overlap_ratio: float = 0.1,
                 min_chunk_size: int = 50,
                 semantic_threshold: float = 0.75):
        """
        Initialize the custom chunker.

        Args:
            embed_model: LlamaIndex BaseEmbedding model used to embed chunks.
            chunk_sizes: Hierarchical chunk sizes [large, medium, small].
                Defaults to [2048, 512, 128].
            separators: Separators for hierarchical splitting. Defaults to
                paragraph, line, and sentence-level punctuation.
            overlap_ratio: Fraction of neighbor words prepended/appended to
                each chunk as context overlap.
            min_chunk_size: Minimum chunk size in characters.
            semantic_threshold: Semantic similarity threshold.
                NOTE(review): stored but not used anywhere in this class —
                confirm whether it should gate merging/filtering.
        """
        # FIX: None sentinels instead of mutable default arguments — a
        # shared list default would be mutated across all instances.
        if chunk_sizes is None:
            chunk_sizes = [2048, 512, 128]
        if separators is None:
            separators = ["\n\n", "\n", ".", "!", "?", "—"]

        self.embed_model = embed_model
        # Largest size first so level 0 yields the biggest chunks.
        self.chunk_sizes = sorted(chunk_sizes, reverse=True)
        self.separators = separators
        self.overlap_ratio = overlap_ratio
        self.min_chunk_size = min_chunk_size
        self.semantic_threshold = semantic_threshold

        logger.info(f"✅ CustomRecursiveChunker initialisé avec {len(self.chunk_sizes)} niveaux")

    def _generate_chunk_id(self, text: str, level: int, parent_id: str = None) -> str:
        """Derive a stable 12-hex-char id from the chunk prefix, level and parent."""
        base_string = f"{text[:50]}-{level}-{parent_id or 'root'}"
        return hashlib.md5(base_string.encode()).hexdigest()[:12]

    def _split_by_separators(self, text: str, separators: List[str]) -> List[str]:
        """Split text by each separator in turn (coarsest first).

        Chunks already at or below min_chunk_size are passed through
        unsplit; empty/whitespace-only fragments are dropped, and the
        final result only keeps fragments >= min_chunk_size characters.
        """
        chunks = [text]

        for separator in separators:
            new_chunks = []
            for chunk in chunks:
                if len(chunk) > self.min_chunk_size:
                    split_parts = chunk.split(separator)
                    # Strip whitespace and drop empty fragments.
                    split_parts = [part.strip() for part in split_parts if part.strip()]
                    new_chunks.extend(split_parts)
                else:
                    new_chunks.append(chunk)
            chunks = new_chunks

        return [chunk for chunk in chunks if len(chunk.strip()) >= self.min_chunk_size]

    def _apply_size_constraint(self, chunks: List[str], max_size: int) -> List[str]:
        """Enforce a maximum chunk size, splitting oversized chunks on words."""
        result_chunks = []

        for chunk in chunks:
            if len(chunk) <= max_size:
                result_chunks.append(chunk)
            else:
                # Re-split an oversized chunk on word boundaries.
                words = chunk.split()
                current_chunk = []
                current_size = 0

                for word in words:
                    word_size = len(word) + 1  # +1 for the joining space
                    if current_size + word_size > max_size and current_chunk:
                        result_chunks.append(" ".join(current_chunk))
                        current_chunk = [word]
                        current_size = word_size
                    else:
                        current_chunk.append(word)
                        current_size += word_size

                if current_chunk:
                    result_chunks.append(" ".join(current_chunk))

        return result_chunks

    def _add_overlap(self, chunks: List[str]) -> List[str]:
        """Add word-level context overlap between adjacent chunks."""
        if len(chunks) <= 1:
            return chunks

        overlapped_chunks = []

        for i, chunk in enumerate(chunks):
            current_chunk = chunk

            # Prepend the tail of the previous chunk.
            if i > 0:
                prev_words = chunks[i - 1].split()
                overlap_size = int(len(prev_words) * self.overlap_ratio)
                if overlap_size > 0:
                    prefix = " ".join(prev_words[-overlap_size:])
                    current_chunk = f"{prefix} {current_chunk}"

            # Append the head of the next chunk.
            if i < len(chunks) - 1:
                next_words = chunks[i + 1].split()
                overlap_size = int(len(next_words) * self.overlap_ratio)
                if overlap_size > 0:
                    suffix = " ".join(next_words[:overlap_size])
                    current_chunk = f"{current_chunk} {suffix}"

            overlapped_chunks.append(current_chunk)

        return overlapped_chunks

    async def _get_embedding(self, text: str) -> Optional[List[float]]:
        """Embed text via the LlamaIndex model; return None on failure."""
        try:
            # Standard LlamaIndex BaseEmbedding async API.
            embedding = await self.embed_model.aget_text_embedding(text)
            return embedding
        except Exception as e:
            logger.warning(f"⚠️ Erreur embedding pour chunk: {e}")
            return None

    def _calculate_semantic_similarity(self, embedding1: List[float],
                                       embedding2: List[float]) -> float:
        """Cosine similarity between two embedding vectors (0.0 on error)."""
        try:
            import numpy as np

            vec1 = np.array(embedding1)
            vec2 = np.array(embedding2)

            dot_product = np.dot(vec1, vec2)
            magnitude1 = np.linalg.norm(vec1)
            magnitude2 = np.linalg.norm(vec2)

            # Guard against zero vectors (undefined cosine similarity).
            if magnitude1 == 0 or magnitude2 == 0:
                return 0.0

            similarity = dot_product / (magnitude1 * magnitude2)
            return float(similarity)

        except Exception as e:
            logger.warning(f"⚠️ Erreur calcul similarité: {e}")
            return 0.0

    async def _chunk_recursive_level(self, text: str, level: int,
                                     parent_id: Optional[str] = None) -> List[ChunkResult]:
        """Chunk one hierarchy level, then recurse into the next level.

        Returns the chunks of this level plus all descendant chunks,
        with parent/children ids wired up.
        """
        if level >= len(self.chunk_sizes):
            return []

        max_size = self.chunk_sizes[level]

        # 1. Coarse split by separators.
        raw_chunks = self._split_by_separators(text, self.separators)

        # 2. Enforce this level's maximum size.
        sized_chunks = self._apply_size_constraint(raw_chunks, max_size)

        # 3. Add inter-chunk overlap.
        overlapped_chunks = self._add_overlap(sized_chunks)

        # 4. Build ChunkResult objects with embeddings.
        chunk_results = []

        for i, chunk_text in enumerate(overlapped_chunks):
            chunk_id = self._generate_chunk_id(chunk_text, level, parent_id)

            embedding = await self._get_embedding(chunk_text)

            chunk_result = ChunkResult(
                id=chunk_id,
                text=chunk_text,
                level=level,
                parent_id=parent_id,
                embedding_vector=embedding,
                metadata={
                    "position": i,
                    "total_chunks": len(overlapped_chunks),
                    "size": len(chunk_text),
                    "max_size": max_size
                }
            )

            chunk_results.append(chunk_result)

        # 5. Recurse into the next (smaller) level for big-enough chunks.
        all_chunks = chunk_results.copy()

        for chunk_result in chunk_results:
            if len(chunk_result.text) > self.min_chunk_size * 2:  # only if large enough
                sub_chunks = await self._chunk_recursive_level(
                    chunk_result.text,
                    level + 1,
                    chunk_result.id
                )

                # Wire up parent -> children relations.
                chunk_result.children_ids = [sub_chunk.id for sub_chunk in sub_chunks]
                all_chunks.extend(sub_chunks)

        return all_chunks

    async def chunk_text(self, text: str, metadata: Dict[str, Any] = None) -> List[ChunkResult]:
        """
        Main entry point for recursive chunking.

        Args:
            text: Text to chunk.
            metadata: Extra metadata merged into every chunk's metadata.

        Returns:
            All chunks (every level) with hierarchy relations, or an
            empty list when the input is too short.

        Raises:
            Exception: re-raises any unexpected failure after logging it.
        """
        if not text or len(text.strip()) < self.min_chunk_size:
            logger.warning("⚠️ Texte trop court pour chunking")
            return []

        # FIX: original literal was mojibake-corrupted ("��") — restored emoji.
        logger.info(f"🚀 Début chunking récursif - {len(text)} caractères")

        try:
            # Recursive chunking starting at level 0.
            all_chunks = await self._chunk_recursive_level(text, level=0)

            # Enrich every chunk's metadata.
            for chunk in all_chunks:
                if metadata:
                    chunk.metadata.update(metadata)
                chunk.metadata["total_levels"] = len(self.chunk_sizes)
                chunk.metadata["algorithm"] = "CustomRecursiveChunker"

            # Compute semantic similarities between same-level chunks.
            await self._compute_semantic_similarities(all_chunks)

            logger.info(f"✅ Chunking terminé - {len(all_chunks)} chunks générés")
            return all_chunks

        except Exception as e:
            logger.error(f"❌ Erreur chunking récursif: {e}")
            raise

    async def _compute_semantic_similarities(self, chunks: List[ChunkResult]):
        """For each chunk, store the max cosine similarity to its level siblings.

        Chunks without an embedding are skipped.  NOTE: O(n^2) per level,
        acceptable for the expected chunk counts.
        """
        # Group chunks by hierarchy level.
        chunks_by_level = {}
        for chunk in chunks:
            if chunk.level not in chunks_by_level:
                chunks_by_level[chunk.level] = []
            chunks_by_level[chunk.level].append(chunk)

        # Pairwise similarities within each level.
        for level, level_chunks in chunks_by_level.items():
            for i, chunk1 in enumerate(level_chunks):
                if chunk1.embedding_vector is None:
                    continue

                max_similarity = 0.0
                for j, chunk2 in enumerate(level_chunks):
                    if i != j and chunk2.embedding_vector is not None:
                        similarity = self._calculate_semantic_similarity(
                            chunk1.embedding_vector,
                            chunk2.embedding_vector
                        )
                        max_similarity = max(max_similarity, similarity)

                chunk1.semantic_similarity = max_similarity

    def to_obsidian_format(self, chunks: List[ChunkResult],
                           source_title: str = "Document") -> str:
        """Render chunks as an Obsidian note with hierarchical wiki-links."""
        obsidian_content = []
        obsidian_content.append(f"# {source_title} - Chunking Hiérarchique\n")

        # Group by level for a structured layout.
        chunks_by_level = {}
        for chunk in chunks:
            if chunk.level not in chunks_by_level:
                chunks_by_level[chunk.level] = []
            chunks_by_level[chunk.level].append(chunk)

        for level in sorted(chunks_by_level.keys()):
            level_chunks = chunks_by_level[level]
            obsidian_content.append(f"\n## Niveau {level} ({len(level_chunks)} chunks)\n")

            for chunk in level_chunks:
                # Chunk heading with its id as a wiki-link.
                obsidian_content.append(f"### [[{chunk.id}]] {chunk.id}")

                # Metadata block.
                obsidian_content.append("```yaml")
                obsidian_content.append(f"level: {chunk.level}")
                obsidian_content.append(f"parent: {chunk.parent_id or 'root'}")
                obsidian_content.append(f"children: {len(chunk.children_ids)}")
                obsidian_content.append(f"size: {len(chunk.text)}")
                # FIX: explicit None check — a legitimate similarity of 0.0
                # was previously omitted by the truthiness test.
                if chunk.semantic_similarity is not None:
                    obsidian_content.append(f"similarity: {chunk.semantic_similarity:.3f}")
                obsidian_content.append("```\n")

                # Navigation links.
                if chunk.parent_id:
                    obsidian_content.append(f"**Parent:** [[{chunk.parent_id}]]")
                if chunk.children_ids:
                    children_links = ", ".join([f"[[{child_id}]]" for child_id in chunk.children_ids])
                    obsidian_content.append(f"**Enfants:** {children_links}")

                # Chunk body.
                obsidian_content.append(f"\n**Contenu:**\n{chunk.text}\n")
                obsidian_content.append("---\n")

        return "\n".join(obsidian_content)

    def to_json_format(self, chunks: List[ChunkResult]) -> List[Dict[str, Any]]:
        """Serialize chunks to plain dicts for the API response."""
        return [
            {
                "id": chunk.id,
                "text": chunk.text,
                "level": chunk.level,
                "parent_id": chunk.parent_id,
                "children_ids": chunk.children_ids,
                "metadata": chunk.metadata,
                "has_embedding": chunk.embedding_vector is not None,
                "semantic_similarity": chunk.semantic_similarity
            }
            for chunk in chunks
        ]
|
deployment_instructions.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Instructions de déploiement - Smart Chunker v4.0
|
| 2 |
+
|
| 3 |
+
## 📋 Étapes à suivre
|
| 4 |
+
|
| 5 |
+
### 1. **Remplacement des fichiers**
|
| 6 |
+
Remplace les fichiers suivants dans ton projet par les versions corrigées :
|
| 7 |
+
|
| 8 |
+
```bash
|
| 9 |
+
# Fichiers à remplacer
|
| 10 |
+
├── chunker_pipeline.py # ✅ Version corrigée avec CustomRecursiveChunker
|
| 11 |
+
├── requirements.txt # ✅ Dépendances compatibles épinglées
|
| 12 |
+
├── app.py # ✅ API FastAPI mise à jour
|
| 13 |
+
└── custom_recursive_chunker.py # ✅ Nouveau fichier à ajouter
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
### 2. **Garde les fichiers existants**
|
| 17 |
+
Ces fichiers restent inchangés :
|
| 18 |
+
- `config.yaml` ✅ (déjà compatible)
|
| 19 |
+
- `schemas.py` ✅ (déjà compatible)
|
| 20 |
+
- `Dockerfile` (si nécessaire)
|
| 21 |
+
|
| 22 |
+
### 3. **Structure finale du projet**
|
| 23 |
+
```
|
| 24 |
+
ton_projet/
|
| 25 |
+
├── app.py # ✅ API FastAPI corrigée
|
| 26 |
+
├── chunker_pipeline.py # ✅ Pipeline principal corrigé
|
| 27 |
+
├── custom_recursive_chunker.py # ✅ NOUVEAU - Chunker personnalisé
|
| 28 |
+
├── requirements.txt # ✅ Dépendances mises à jour
|
| 29 |
+
├── config.yaml # ✅ Configuration existante (OK)
|
| 30 |
+
├── schemas.py # ✅ Schémas existants (OK)
|
| 31 |
+
└── Dockerfile # ✅ Si nécessaire
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## 🔧 Changements principaux
|
| 35 |
+
|
| 36 |
+
### ✅ Chunker personnalisé au lieu de chonkie instable
|
| 37 |
+
- Plus de problèmes avec `separators=` ou `chunk_sizes=`
|
| 38 |
+
- Logique récursive native et contrôlée
|
| 39 |
+
- Compatible avec toutes les versions
|
| 40 |
+
|
| 41 |
+
### ✅ Embeddings corrigés
|
| 42 |
+
- `EmbeddingWrapper` pour compatibilité LlamaIndex
|
| 43 |
+
- Plus d'erreurs `encode()`
|
| 44 |
+
- SentenceTransformer utilisé directement
|
| 45 |
+
|
| 46 |
+
### ✅ Imports simplifiés
|
| 47 |
+
- Import seulement de `SemanticChunker` depuis chonkie (optionnel)
|
| 48 |
+
- Plus de dépendances sur des modules instables
|
| 49 |
+
- Fallback automatique si chonkie indisponible
|
| 50 |
+
|
| 51 |
+
## 🧪 Test de fonctionnement
|
| 52 |
+
|
| 53 |
+
Après déploiement, teste avec :
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
# 1. Health check
|
| 57 |
+
curl -X GET "http://localhost:7860/health"
|
| 58 |
+
|
| 59 |
+
# 2. Test simple
|
| 60 |
+
curl -X POST "http://localhost:7860/test"
|
| 61 |
+
|
| 62 |
+
# 3. Chunking personnalisé
|
| 63 |
+
curl -X POST "http://localhost:7860/chunk" \
|
| 64 |
+
-H "Content-Type: application/json" \
|
| 65 |
+
-d '{
|
| 66 |
+
"text": "Ton texte à chunker ici...",
|
| 67 |
+
"method": "custom_recursive",
|
| 68 |
+
"export_obsidian": true,
|
| 69 |
+
"metadata": {"title": "Test Document"}
|
| 70 |
+
}'
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
## 📊 Méthodes disponibles
|
| 74 |
+
|
| 75 |
+
1. **`custom_recursive`** (recommandée) - Chunker intelligent personnalisé
|
| 76 |
+
2. **`chonkie_semantic`** (si chonkie disponible) - Chunking sémantique
|
| 77 |
+
3. **`llamaindex`** (fallback) - Chunking standard LlamaIndex
|
| 78 |
+
|
| 79 |
+
## 🔍 Vérifications de debug
|
| 80 |
+
|
| 81 |
+
Si des problèmes persistent :
|
| 82 |
+
|
| 83 |
+
### 1. Vérifier les logs d'initialisation
|
| 84 |
+
```bash
|
| 85 |
+
# Recherche ces lignes dans les logs :
|
| 86 |
+
✅ CustomRecursiveChunker initialisé avec succès
|
| 87 |
+
✅ SmartChunkerPipeline v4.0 initialisé avec succès
|
| 88 |
+
✅ Pipeline initialisé avec succès
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### 2. Endpoint de diagnostic
|
| 92 |
+
```bash
|
| 93 |
+
GET /config # Configuration active
|
| 94 |
+
GET /methods # Méthodes disponibles
|
| 95 |
+
GET /health # État détaillé des composants
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### 3. Test minimal
|
| 99 |
+
```python
|
| 100 |
+
# Test local rapide
|
| 101 |
+
import asyncio
|
| 102 |
+
from chunker_pipeline import SmartChunkerPipeline
|
| 103 |
+
|
| 104 |
+
async def test():
|
| 105 |
+
pipeline = SmartChunkerPipeline()
|
| 106 |
+
await pipeline.initialize()
|
| 107 |
+
chunks = await pipeline.chunk_text("Test simple", method="custom_recursive")
|
| 108 |
+
print(f"✅ {len(chunks)} chunks générés")
|
| 109 |
+
|
| 110 |
+
asyncio.run(test())
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
## 🎯 Avantages de cette solution
|
| 114 |
+
|
| 115 |
+
### ✅ **Stabilité**
|
| 116 |
+
- Plus de dépendances sur des versions instables de chonkie
|
| 117 |
+
- Chunker personnalisé 100% maîtrisé
|
| 118 |
+
- Compatibilité garantie avec LlamaIndex v0.12
|
| 119 |
+
|
| 120 |
+
### ✅ **Fonctionnalités complètes**
|
| 121 |
+
- Chunking récursif hiérarchique intelligent
|
| 122 |
+
- Relations parent-enfant automatiques
|
| 123 |
+
- Embeddings sémantiques intégrés
|
| 124 |
+
- Export Obsidian formaté
|
| 125 |
+
|
| 126 |
+
### ✅ **Performance**
|
| 127 |
+
- Optimisé pour HuggingFace Spaces
|
| 128 |
+
- Modèles légers (CPU-friendly)
|
| 129 |
+
- Cache intelligent des embeddings
|
| 130 |
+
|
| 131 |
+
### ✅ **Flexibilité**
|
| 132 |
+
- 3 méthodes de chunking disponibles
|
| 133 |
+
- Configuration modulaire via YAML
|
| 134 |
+
- API RESTful complète
|
| 135 |
+
|
| 136 |
+
## 🚨 Points d'attention
|
| 137 |
+
|
| 138 |
+
1. **Première initialisation** peut prendre 1-2 minutes (téléchargement modèles)
|
| 139 |
+
2. **Mémoire requise** : ~2GB RAM pour les modèles
|
| 140 |
+
3. **CPU uniquement** sur HF Spaces gratuits (normal)
|
| 141 |
+
4. **Cache des modèles** : utilise le répertoire temporaire
|
| 142 |
+
|
| 143 |
+
## 📞 Support
|
| 144 |
+
|
| 145 |
+
Si tu rencontres des erreurs après déploiement :
|
| 146 |
+
|
| 147 |
+
1. **Copie les logs complets** (surtout les lignes avec ❌)
|
| 148 |
+
2. **Teste l'endpoint** `/health` pour diagnostic
|
| 149 |
+
3. **Vérifie** que tous les fichiers sont bien remplacés
|
| 150 |
+
4. **Confirme** la version Python (3.10+ recommandée)
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
🎉 **Cette solution devrait résoudre définitivement tous les problèmes identifiés dans ta discussion avec GPT !**
|
guide_deploiement_hf.md
ADDED
|
@@ -0,0 +1,574 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Guide de Déploiement Hugging Face Space
|
| 2 |
+
## Chunking Sémantique Intelligent Récursif avec Parentalité
|
| 3 |
+
|
| 4 |
+
**Version:** 1.0.0
|
| 5 |
+
**Prérequis:** Compte Hugging Face existant
|
| 6 |
+
**Durée estimée:** 15-20 minutes
|
| 7 |
+
**Niveau:** Débutant à Intermédiaire
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## 📋 **Table des Matières**
|
| 12 |
+
|
| 13 |
+
1. [Préparation des fichiers](#1-préparation-des-fichiers)
|
| 14 |
+
2. [Création du Space Hugging Face](#2-création-du-space-hugging-face)
|
| 15 |
+
3. [Configuration du Space](#3-configuration-du-space)
|
| 16 |
+
4. [Upload des fichiers](#4-upload-des-fichiers)
|
| 17 |
+
5. [Configuration des variables d'environnement](#5-configuration-des-variables-denvironnement)
|
| 18 |
+
6. [Démarrage et monitoring](#6-démarrage-et-monitoring)
|
| 19 |
+
7. [Tests et validation](#7-tests-et-validation)
|
| 20 |
+
8. [Intégration avec n8n](#8-intégration-avec-n8n)
|
| 21 |
+
9. [Troubleshooting](#9-troubleshooting)
|
| 22 |
+
10. [Optimisations avancées](#10-optimisations-avancées)
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## 1. 📂 Préparation des Fichiers
|
| 27 |
+
|
| 28 |
+
### 1.1 Structure de projet requise
|
| 29 |
+
|
| 30 |
+
Créez un dossier local `chunking-intelligent-api/` avec cette structure :
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
chunking-intelligent-api/
|
| 34 |
+
├── app.py # ✅ API FastAPI principale
|
| 35 |
+
├── schemas.py # ✅ Modèles Pydantic
|
| 36 |
+
├── pipeline.py # ✅ Pipeline de base
|
| 37 |
+
├── chunker_pipeline.py # ✅ Logique métier chunking
|
| 38 |
+
├── config.yaml # ✅ Configuration système
|
| 39 |
+
├── requirements.txt # ✅ Dépendances
|
| 40 |
+
├── Dockerfile # ✅ Container optimisé
|
| 41 |
+
├── README.md # 📝 Documentation (optionnel)
|
| 42 |
+
└── .gitignore # 🔒 Fichiers à ignorer (optionnel)
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
### 1.2 Vérification des fichiers
|
| 46 |
+
|
| 47 |
+
**✅ Checklist avant upload :**
|
| 48 |
+
|
| 49 |
+
- [ ] `app.py` contient le code FastAPI complet
|
| 50 |
+
- [ ] `schemas.py` définit tous les modèles Pydantic
|
| 51 |
+
- [ ] `pipeline.py` contient la logique de base
|
| 52 |
+
- [ ] `chunker_pipeline.py` contient votre workflow intelligent
|
| 53 |
+
- [ ] `config.yaml` est correctement configuré
|
| 54 |
+
- [ ] `requirements.txt` contient toutes les dépendances
|
| 55 |
+
- [ ] `Dockerfile` utilise `FROM python:3.10` (pas `-slim`)
|
| 56 |
+
|
| 57 |
+
### 1.3 Fichiers optionnels
|
| 58 |
+
|
| 59 |
+
**README.md** (recommandé) :
|
| 60 |
+
```markdown
|
| 61 |
+
# Chunking Sémantique Intelligent API
|
| 62 |
+
|
| 63 |
+
API de découpage récursif hiérarchique avec parentalité pour Second Cerveau et Agents IA.
|
| 64 |
+
|
| 65 |
+
## Utilisation
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
curl -X POST "https://VOTRE_SPACE.hf.space/chunk" \
|
| 69 |
+
-H "Content-Type: application/json" \
|
| 70 |
+
-d '{"text": "Votre texte ici", "titre": "Mon document"}'
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
## Endpoints
|
| 74 |
+
|
| 75 |
+
- `GET /` - Status de l'API
|
| 76 |
+
- `GET /health` - Vérification santé
|
| 77 |
+
- `POST /chunk` - Chunking principal
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
**.gitignore** (optionnel) :
|
| 81 |
+
```
|
| 82 |
+
__pycache__/
|
| 83 |
+
*.pyc
|
| 84 |
+
*.pyo
|
| 85 |
+
.env
|
| 86 |
+
.venv/
|
| 87 |
+
cache/
|
| 88 |
+
logs/
|
| 89 |
+
*.log
|
| 90 |
+
.DS_Store
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## 2. 🏗️ Création du Space Hugging Face
|
| 96 |
+
|
| 97 |
+
### 2.1 Accès à Hugging Face
|
| 98 |
+
|
| 99 |
+
1. **Connectez-vous** à [huggingface.co](https://huggingface.co)
|
| 100 |
+
2. **Vérifiez votre compte** est activé et fonctionnel
|
| 101 |
+
|
| 102 |
+
### 2.2 Création du nouveau Space
|
| 103 |
+
|
| 104 |
+
1. **Cliquez** sur votre avatar (coin supérieur droit)
|
| 105 |
+
2. **Sélectionnez** "New Space"
|
| 106 |
+
3. **Remplissez le formulaire** :
|
| 107 |
+
|
| 108 |
+
| Champ | Valeur Recommandée | Notes |
|
| 109 |
+
|-------|-------------------|-------|
|
| 110 |
+
| **Space name** | `chunking-intelligent-api` | Nom unique, sans espaces |
|
| 111 |
+
| **License** | `MIT` | Licence open source |
|
| 112 |
+
| **Select the SDK** | `Docker` | ⚠️ **IMPORTANT: Choisir Docker** |
|
| 113 |
+
| **Hardware** | `CPU basic (free)` | Suffisant pour votre projet |
|
| 114 |
+
| **Visibility** | `Public` | Accessible pour n8n |
|
| 115 |
+
|
| 116 |
+
4. **Cliquez** "Create Space"
|
| 117 |
+
|
| 118 |
+
### 2.3 Attendre la création
|
| 119 |
+
|
| 120 |
+
- Le Space se crée automatiquement (30-60 secondes)
|
| 121 |
+
- Vous êtes redirigé vers la page du Space
|
| 122 |
+
- Status initial : "Building" ou "Pending"
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## 3. ⚙️ Configuration du Space
|
| 127 |
+
|
| 128 |
+
### 3.1 Accès aux paramètres
|
| 129 |
+
|
| 130 |
+
1. **Cliquez** sur l'onglet "Settings" dans votre Space
|
| 131 |
+
2. **Vérifiez la configuration** :
|
| 132 |
+
|
| 133 |
+
| Paramètre | Valeur | Obligatoire |
|
| 134 |
+
|-----------|--------|-------------|
|
| 135 |
+
| **SDK** | Docker | ✅ Oui |
|
| 136 |
+
| **Hardware** | CPU basic | ✅ Oui |
|
| 137 |
+
| **Python version** | Auto-detect | ✅ Oui |
|
| 138 |
+
| **Port** | 7860 | ✅ Oui |
|
| 139 |
+
|
| 140 |
+
### 3.2 Configuration avancée (optionnel)
|
| 141 |
+
|
| 142 |
+
**Secrets et variables d'environnement** (si vous avez une clé OpenAI) :
|
| 143 |
+
|
| 144 |
+
1. **Allez** dans "Settings" → "Repository secrets"
|
| 145 |
+
2. **Ajoutez** une nouvelle variable :
|
| 146 |
+
- **Name:** `OPENAI_API_KEY`
|
| 147 |
+
- **Value:** `votre_clé_openai_ici`
|
| 148 |
+
- **Cliquez** "Add secret"
|
| 149 |
+
|
| 150 |
+
⚠️ **Note** : Le projet fonctionne sans clé OpenAI grâce aux fallbacks configurés.
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## 4. 📤 Upload des Fichiers
|
| 155 |
+
|
| 156 |
+
### 4.1 Méthode recommandée : Interface Web
|
| 157 |
+
|
| 158 |
+
1. **Cliquez** sur l'onglet "Files" dans votre Space
|
| 159 |
+
2. **Pour chaque fichier** à uploader :
|
| 160 |
+
|
| 161 |
+
#### Upload de `Dockerfile` :
|
| 162 |
+
1. **Cliquez** "Add file" → "Create a new file"
|
| 163 |
+
2. **Nom du fichier** : `Dockerfile`
|
| 164 |
+
3. **Copiez-collez** le contenu du Dockerfile généré
|
| 165 |
+
4. **Cliquez** "Commit new file to main"
|
| 166 |
+
|
| 167 |
+
#### Upload de `app.py` :
|
| 168 |
+
1. **Cliquez** "Add file" → "Create a new file"
|
| 169 |
+
2. **Nom du fichier** : `app.py`
|
| 170 |
+
3. **Copiez-collez** le contenu d'app.py
|
| 171 |
+
4. **Cliquez** "Commit new file to main"
|
| 172 |
+
|
| 173 |
+
#### Upload des autres fichiers :
|
| 174 |
+
Répétez pour chaque fichier dans cet ordre :
|
| 175 |
+
1. `requirements.txt` (important en premier)
|
| 176 |
+
2. `config.yaml`
|
| 177 |
+
3. `schemas.py`
|
| 178 |
+
4. `custom_recursive_chunker.py`
|
| 179 |
+
5. `chunker_pipeline.py`
|
| 180 |
+
6. `README.md` (optionnel)
|
| 181 |
+
|
| 182 |
+
### 4.2 Méthode alternative : Git (utilisateurs avancés)
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
# Cloner le repo du Space
|
| 186 |
+
git clone https://huggingface.co/spaces/VOTRE_USERNAME/chunking-intelligent-api
|
| 187 |
+
cd chunking-intelligent-api
|
| 188 |
+
|
| 189 |
+
# Copier tous vos fichiers dans ce dossier
|
| 190 |
+
cp /chemin/vers/vos/fichiers/* .
|
| 191 |
+
|
| 192 |
+
# Commit et push
|
| 193 |
+
git add .
|
| 194 |
+
git commit -m "🚀 Initial chunking intelligent API"
|
| 195 |
+
git push origin main
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
### 4.3 Vérification upload
|
| 199 |
+
|
| 200 |
+
**✅ Checklist après upload :**
|
| 201 |
+
|
| 202 |
+
- [ ] Tous les fichiers sont visibles dans l'onglet "Files"
|
| 203 |
+
- [ ] Aucun message d'erreur dans l'interface
|
| 204 |
+
- [ ] Le Space passe en status "Building"
|
| 205 |
+
- [ ] Logs de build commencent à apparaître
|
| 206 |
+
|
| 207 |
+
---
|
| 208 |
+
|
| 209 |
+
## 5. 🔧 Configuration des Variables d'Environnement
|
| 210 |
+
|
| 211 |
+
### 5.1 Variables obligatoires
|
| 212 |
+
|
| 213 |
+
**Dans Settings → Repository secrets :**
|
| 214 |
+
|
| 215 |
+
| Variable | Valeur | Description |
|
| 216 |
+
|----------|--------|-------------|
|
| 217 |
+
| `OPENAI_API_KEY` | `votre_clé` ou `demo-key` | Clé API OpenAI (optionnel) |
|
| 218 |
+
| `HF_TOKEN` | Auto-détecté | Token HF (automatique) |
|
| 219 |
+
|
| 220 |
+
### 5.2 Variables avancées (optionnel)
|
| 221 |
+
|
| 222 |
+
**Pour personnalisation avancée :**
|
| 223 |
+
|
| 224 |
+
| Variable | Valeur par défaut | Utilité |
|
| 225 |
+
|----------|------------------|---------|
|
| 226 |
+
| `CHUNKING_DEBUG` | `false` | Mode debug détaillé |
|
| 227 |
+
| `MAX_TEXT_LENGTH` | `500000` | Limite taille texte |
|
| 228 |
+
| `CACHE_SIZE_MB` | `100` | Taille cache |
|
| 229 |
+
|
| 230 |
+
### 5.3 Configuration dans le code
|
| 231 |
+
|
| 232 |
+
Les variables sont automatiquement chargées grâce à :
|
| 233 |
+
```python
|
| 234 |
+
import os
|
| 235 |
+
api_key = os.getenv("OPENAI_API_KEY", "demo-key")
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
---
|
| 239 |
+
|
| 240 |
+
## 6. 🎬 Démarrage et Monitoring
|
| 241 |
+
|
| 242 |
+
### 6.1 Suivi du build
|
| 243 |
+
|
| 244 |
+
1. **Allez** dans l'onglet "Logs" de votre Space
|
| 245 |
+
2. **Surveillez** les étapes de build :
|
| 246 |
+
|
| 247 |
+
```
|
| 248 |
+
✅ Étapes de build normales :
|
| 249 |
+
- Downloading image layers
|
| 250 |
+
- Installing system dependencies
|
| 251 |
+
- Installing Python packages
|
| 252 |
+
- Copying application files
|
| 253 |
+
- Starting uvicorn server
|
| 254 |
+
- Application startup complete
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
**⏱️ Temps de build attendu :** 5-10 minutes (première fois)
|
| 258 |
+
|
| 259 |
+
### 6.2 Détection problèmes build
|
| 260 |
+
|
| 261 |
+
**❌ Erreurs communes et solutions :**
|
| 262 |
+
|
| 263 |
+
| Erreur | Cause | Solution |
|
| 264 |
+
|--------|-------|----------|
|
| 265 |
+
| `CMake not found` | Image `-slim` utilisée | Vérifier `FROM python:3.10` |
|
| 266 |
+
| `Package not found` | Dépendance manquante | Vérifier `requirements.txt` |
|
| 267 |
+
| `Port 7860 not available` | Config port incorrecte | Vérifier `Dockerfile` EXPOSE |
|
| 268 |
+
| `Import error` | Ordre dépendances | Réorganiser `requirements.txt` |
|
| 269 |
+
|
| 270 |
+
### 6.3 Validation démarrage réussi
|
| 271 |
+
|
| 272 |
+
**✅ Signes de succès :**
|
| 273 |
+
|
| 274 |
+
- Status Space = "Running" (vert)
|
| 275 |
+
- Logs finissent par "Application startup complete"
|
| 276 |
+
- URL Space répond (peut prendre 2-3 minutes)
|
| 277 |
+
- Onglet "App" montre l'interface
|
| 278 |
+
|
| 279 |
+
---
|
| 280 |
+
|
| 281 |
+
## 7. ✅ Tests et Validation
|
| 282 |
+
|
| 283 |
+
### 7.1 Test automatique santé
|
| 284 |
+
|
| 285 |
+
1. **Cliquez** sur l'onglet "App" de votre Space
|
| 286 |
+
2. **L'URL** sera : `https://VOTRE_USERNAME-chunking-intelligent-api.hf.space`
|
| 287 |
+
3. **Vérifiez** que la page charge sans erreur
|
| 288 |
+
|
| 289 |
+
### 7.2 Test endpoint status
|
| 290 |
+
|
| 291 |
+
**Curl de base :**
|
| 292 |
+
```bash
|
| 293 |
+
curl https://VOTRE_USERNAME-chunking-intelligent-api.hf.space/
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
**Réponse attendue :**
|
| 297 |
+
```json
|
| 298 |
+
{
|
| 299 |
+
"status": "running",
|
| 300 |
+
"service": "Chunking Sémantique Intelligent API",
|
| 301 |
+
"version": "1.0.0",
|
| 302 |
+
"endpoints": ["/chunk", "/health"]
|
| 303 |
+
}
|
| 304 |
+
```
|
| 305 |
+
|
| 306 |
+
### 7.3 Test endpoint santé
|
| 307 |
+
|
| 308 |
+
```bash
|
| 309 |
+
curl https://VOTRE_USERNAME-chunking-intelligent-api.hf.space/health
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
**Réponse attendue :**
|
| 313 |
+
```json
|
| 314 |
+
{
|
| 315 |
+
"status": "healthy",
|
| 316 |
+
"pipeline_ready": true,
|
| 317 |
+
"memory_usage": {
|
| 318 |
+
"memory_usage_mb": 450.23,
|
| 319 |
+
"memory_percent": 22.5
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
```
|
| 323 |
+
|
| 324 |
+
### 7.4 Test chunking simple
|
| 325 |
+
|
| 326 |
+
```bash
|
| 327 |
+
curl -X POST "https://VOTRE_USERNAME-chunking-intelligent-api.hf.space/chunk" \
|
| 328 |
+
-H "Content-Type: application/json" \
|
| 329 |
+
-d '{
|
| 330 |
+
"text": "Voici un texte de test pour le chunking. Il contient plusieurs phrases. Chaque phrase devrait être analysée sémantiquement. Le système doit créer des chunks intelligents.",
|
| 331 |
+
"titre": "Test de chunking",
|
| 332 |
+
"source": "test_manuel"
|
| 333 |
+
}'
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
**Réponse attendue (structure) :**
|
| 337 |
+
```json
|
| 338 |
+
{
|
| 339 |
+
"chunks": [
|
| 340 |
+
{
|
| 341 |
+
"content": "...",
|
| 342 |
+
"metadata": {
|
| 343 |
+
"chunk_id": "...",
|
| 344 |
+
"level": 0,
|
| 345 |
+
"tokens_count": 25,
|
| 346 |
+
"main_concepts": ["test", "chunking"],
|
| 347 |
+
"keywords": ["texte", "analyse", "intelligent"]
|
| 348 |
+
}
|
| 349 |
+
}
|
| 350 |
+
],
|
| 351 |
+
"total_chunks": 3,
|
| 352 |
+
"processing_time": 2.45,
|
| 353 |
+
"source_metadata": {...}
|
| 354 |
+
}
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
---
|
| 358 |
+
|
| 359 |
+
## 8. 🔗 Intégration avec n8n
|
| 360 |
+
|
| 361 |
+
### 8.1 Configuration node HTTP Request
|
| 362 |
+
|
| 363 |
+
**Dans n8n, créez un node "HTTP Request" :**
|
| 364 |
+
|
| 365 |
+
| Paramètre | Valeur |
|
| 366 |
+
|-----------|--------|
|
| 367 |
+
| **Method** | POST |
|
| 368 |
+
| **URL** | `https://VOTRE_USERNAME-chunking-intelligent-api.hf.space/chunk` |
|
| 369 |
+
| **Authentication** | None |
|
| 370 |
+
| **Headers** | `Content-Type: application/json` |
|
| 371 |
+
| **Body** | JSON |
|
| 372 |
+
|
| 373 |
+
### 8.2 Structure JSON pour n8n
|
| 374 |
+
|
| 375 |
+
```json
|
| 376 |
+
{
|
| 377 |
+
"text": "{{ $json.transcription }}",
|
| 378 |
+
"source_id": "{{ $json.source_id }}",
|
| 379 |
+
"titre": "{{ $json.titre }}",
|
| 380 |
+
"source": "{{ $json.lien }}",
|
| 381 |
+
"type": "{{ $json.type }}",
|
| 382 |
+
"chunk_sizes": [2048, 512, 128],
|
| 383 |
+
"include_metadata": true,
|
| 384 |
+
"detect_structure": true
|
| 385 |
+
}
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
### 8.3 Traitement réponse n8n
|
| 389 |
+
|
| 390 |
+
**Code JavaScript dans node "Code" :**
|
| 391 |
+
```javascript
|
| 392 |
+
// Traitement réponse chunking
|
| 393 |
+
const chunkingResponse = $input.all()[0].json;
|
| 394 |
+
|
| 395 |
+
const processedChunks = chunkingResponse.chunks.map(chunk => ({
|
| 396 |
+
chunk_id: chunk.metadata.chunk_id,
|
| 397 |
+
content: chunk.content,
|
| 398 |
+
level: chunk.metadata.level,
|
| 399 |
+
concepts: chunk.metadata.main_concepts,
|
| 400 |
+
keywords: chunk.metadata.keywords,
|
| 401 |
+
parent_id: chunk.metadata.parent_id,
|
| 402 |
+
children_ids: chunk.metadata.children_ids
|
| 403 |
+
}));
|
| 404 |
+
|
| 405 |
+
return {
|
| 406 |
+
chunks: processedChunks,
|
| 407 |
+
summary: {
|
| 408 |
+
total_chunks: chunkingResponse.total_chunks,
|
| 409 |
+
processing_time: chunkingResponse.processing_time,
|
| 410 |
+
source: chunkingResponse.source_metadata
|
| 411 |
+
}
|
| 412 |
+
};
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
### 8.4 Workflow n8n complet
|
| 416 |
+
|
| 417 |
+
**Exemple workflow :**
|
| 418 |
+
1. **Trigger** : Webhook ou Schedule
|
| 419 |
+
2. **Google Sheets** : Lecture transcriptions
|
| 420 |
+
3. **HTTP Request** : Appel chunking API
|
| 421 |
+
4. **Code** : Traitement réponse
|
| 422 |
+
5. **Mem0/Qdrant** : Stockage chunks
|
| 423 |
+
6. **Obsidian** : Export notes
|
| 424 |
+
|
| 425 |
+
---
|
| 426 |
+
|
| 427 |
+
## 9. 🔧 Troubleshooting
|
| 428 |
+
|
| 429 |
+
### 9.1 Problèmes de build
|
| 430 |
+
|
| 431 |
+
**Erreur : "Package not found"**
|
| 432 |
+
```bash
|
| 433 |
+
# Solution : Vérifier requirements.txt
|
| 434 |
+
ERROR: Could not find a version that satisfies the requirement xyz
|
| 435 |
+
```
|
| 436 |
+
→ Vérifiez noms et versions dans `requirements.txt`
|
| 437 |
+
|
| 438 |
+
**Erreur : "CMake not found"**
|
| 439 |
+
```bash
|
| 440 |
+
CMake Error: CMAKE_C_COMPILER not set
|
| 441 |
+
```
|
| 442 |
+
→ Vérifiez `FROM python:3.10` (pas `-slim`)
|
| 443 |
+
|
| 444 |
+
**Erreur : "Memory limit exceeded"**
|
| 445 |
+
```bash
|
| 446 |
+
Killed (out of memory)
|
| 447 |
+
```
|
| 448 |
+
→ Réduisez dépendances ou activez optimisations
|
| 449 |
+
|
| 450 |
+
### 9.2 Problèmes runtime
|
| 451 |
+
|
| 452 |
+
**Space ne démarre pas**
|
| 453 |
+
1. Vérifiez logs pour erreurs Python
|
| 454 |
+
2. Testez code localement d'abord
|
| 455 |
+
3. Vérifiez variables d'environnement
|
| 456 |
+
|
| 457 |
+
**Erreur 500 sur /chunk**
|
| 458 |
+
1. Vérifiez logs détaillés
|
| 459 |
+
2. Testez avec texte plus court
|
| 460 |
+
3. Vérifiez clé OpenAI si utilisée
|
| 461 |
+
|
| 462 |
+
**Timeout sur requêtes**
|
| 463 |
+
1. Réduisez taille texte input
|
| 464 |
+
2. Optimisez paramètres chunking
|
| 465 |
+
3. Vérifiez limits dans config.yaml
|
| 466 |
+
|
| 467 |
+
### 9.3 Problèmes performance
|
| 468 |
+
|
| 469 |
+
**Lenteur excessive**
|
| 470 |
+
```yaml
|
| 471 |
+
# Dans config.yaml - Optimisations
|
| 472 |
+
performance:
|
| 473 |
+
memory:
|
| 474 |
+
enable_garbage_collection: true
|
| 475 |
+
caching:
|
| 476 |
+
enabled: true
|
| 477 |
+
concurrency:
|
| 478 |
+
max_workers: 1
|
| 479 |
+
```
|
| 480 |
+
|
| 481 |
+
**Consommation mémoire élevée**
|
| 482 |
+
```python
|
| 483 |
+
# Monitoring dans logs
|
| 484 |
+
memory_usage = pipeline.get_memory_usage()
|
| 485 |
+
if memory_usage["memory_percent"] > 90:
|
| 486 |
+
# Cleanup automatique
|
| 487 |
+
await pipeline.cleanup()
|
| 488 |
+
```
|
| 489 |
+
|
| 490 |
+
### 9.4 Support et aide
|
| 491 |
+
|
| 492 |
+
**Ressources officielles :**
|
| 493 |
+
- [HF Spaces Documentation](https://huggingface.co/docs/hub/spaces)
|
| 494 |
+
- [Docker Support](https://huggingface.co/docs/hub/spaces-sdks-docker)
|
| 495 |
+
- [Community Forum](https://discuss.huggingface.co/)
|
| 496 |
+
|
| 497 |
+
**Debug avancé :**
|
| 498 |
+
```bash
|
| 499 |
+
# Logs détaillés depuis interface HF
|
| 500 |
+
# Ou accès direct via API
|
| 501 |
+
curl https://huggingface.co/api/spaces/VOTRE_USERNAME/chunking-intelligent-api/logs
|
| 502 |
+
```
|
| 503 |
+
|
| 504 |
+
---
|
| 505 |
+
|
| 506 |
+
## 10. 🚀 Optimisations Avancées
|
| 507 |
+
|
| 508 |
+
### 10.1 Optimisations mémoire
|
| 509 |
+
|
| 510 |
+
**Ajouts dans config.yaml :**
|
| 511 |
+
```yaml
|
| 512 |
+
performance:
|
| 513 |
+
memory:
|
| 514 |
+
max_memory_mb: 1800 # Sécurité pour 2GB Space
|
| 515 |
+
cleanup_interval: 50 # Nettoyage plus fréquent
|
| 516 |
+
cache_size_limit: 500 # Cache réduit
|
| 517 |
+
```
|
| 518 |
+
|
| 519 |
+
### 10.2 Optimisations vitesse
|
| 520 |
+
|
| 521 |
+
**Pre-loading modèles dans Dockerfile :**
|
| 522 |
+
```dockerfile
|
| 523 |
+
# Ajout après installation requirements
|
| 524 |
+
RUN python -c "from transformers import AutoModel; AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', cache_dir='/app/cache')"
|
| 525 |
+
```
|
| 526 |
+
|
| 527 |
+
### 10.3 Monitoring avancé
|
| 528 |
+
|
| 529 |
+
**Ajout endpoint metrics :**
|
| 530 |
+
```python
|
| 531 |
+
@app.get("/metrics")
|
| 532 |
+
async def get_metrics():
|
| 533 |
+
return {
|
| 534 |
+
"memory": pipeline.get_memory_usage(),
|
| 535 |
+
"health": await pipeline.health_check(),
|
| 536 |
+
"cache_stats": {
|
| 537 |
+
"embedding_cache": len(pipeline._embedding_cache),
|
| 538 |
+
"concept_cache": len(pipeline._concept_cache)
|
| 539 |
+
}
|
| 540 |
+
}
|
| 541 |
+
```
|
| 542 |
+
|
| 543 |
+
### 10.4 Auto-scaling (si upgrade vers Space Pro)
|
| 544 |
+
|
| 545 |
+
```yaml
|
| 546 |
+
# config.yaml pour Spaces payants
|
| 547 |
+
performance:
|
| 548 |
+
concurrency:
|
| 549 |
+
max_workers: 2 # Plus de workers
|
| 550 |
+
adaptive_scaling: true
|
| 551 |
+
memory:
|
| 552 |
+
max_memory_mb: 7000 # Plus de RAM
|
| 553 |
+
```
|
| 554 |
+
|
| 555 |
+
---
|
| 556 |
+
|
| 557 |
+
## 🎉 Félicitations !
|
| 558 |
+
|
| 559 |
+
Votre **Chunking Sémantique Intelligent API** est maintenant déployée et fonctionnelle sur Hugging Face Space !
|
| 560 |
+
|
| 561 |
+
**🔗 URLs importantes :**
|
| 562 |
+
- **API principale :** `https://VOTRE_USERNAME-chunking-intelligent-api.hf.space`
|
| 563 |
+
- **Endpoint chunking :** `/chunk`
|
| 564 |
+
- **Health check :** `/health`
|
| 565 |
+
- **Métriques :** `/metrics`
|
| 566 |
+
|
| 567 |
+
**📋 Prochaines étapes recommandées :**
|
| 568 |
+
1. ✅ Tester avec vos vrais contenus
|
| 569 |
+
2. ✅ Intégrer dans n8n
|
| 570 |
+
3. ✅ Connecter à Mem0/Qdrant
|
| 571 |
+
4. ✅ Configurer Obsidian
|
| 572 |
+
5. ✅ Créer vos agents spécialisés
|
| 573 |
+
|
| 574 |
+
**🚀 Votre système de Second Cerveau intelligent est prêt !**
|
requirements.txt
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================================================================
|
| 2 |
+
# 📦 requirements.txt - VERSION FINALE FUSIONNÉE v4.0
|
| 3 |
+
# ============================================================================
|
| 4 |
+
# 🧠 Projet : Chunking sémantique intelligent récursif hiérarchique
|
| 5 |
+
# ✅ SOLUTION : Combinaison fichier original + corrections + CustomRecursiveChunker
|
| 6 |
+
# ✅ COMPATIBLE : HuggingFace Spaces + LlamaIndex v0.12 + Chonkie stable
|
| 7 |
+
# ============================================================================
|
| 8 |
+
|
| 9 |
+
# ========= 🚀 API + SERVEUR =========
|
| 10 |
+
fastapi==0.115.6
|
| 11 |
+
uvicorn[standard]==0.32.1
|
| 12 |
+
python-multipart==0.0.19
|
| 13 |
+
|
| 14 |
+
# ========= 📋 CONFIGURATION + VALIDATION =========
|
| 15 |
+
pydantic==2.10.3
|
| 16 |
+
pydantic-settings==2.1.0
|
| 17 |
+
python-dotenv==1.0.0
|
| 18 |
+
PyYAML==6.0.1
|
| 19 |
+
|
| 20 |
+
# ========= 🧠 LLAMAINDEX MODULAIRE v0.12 =========
|
| 21 |
+
# ✅ CORRECTION: Versions compatibles épinglées
|
| 22 |
+
llama-index-core==0.12.8
|
| 23 |
+
llama-index-embeddings-huggingface==0.4.1
|
| 24 |
+
llama-index-llms-huggingface==0.4.1
|
| 25 |
+
llama-index-readers-file==0.4.9
|
| 26 |
+
|
| 27 |
+
# ✅ OPTION LLM LOCAL : Uncomment si tu veux llama-cpp
|
| 28 |
+
# llama-index-llms-llama-cpp==0.3.0
|
| 29 |
+
# llama-cpp-python==0.2.35
|
| 30 |
+
|
| 31 |
+
# ========= 🦛 CHUNKING + MODELS =========
|
| 32 |
+
# ✅ CORRECTION MAJEURE: Version stable publique de Chonkie
|
| 33 |
+
chonkie==1.0.8
|
| 34 |
+
|
| 35 |
+
# ✅ MODELS: Versions compatibles HF Spaces
|
| 36 |
+
transformers>=4.46.0,<4.50.0
|
| 37 |
+
sentence-transformers==3.3.1
|
| 38 |
+
torch>=2.0.0,<2.5.0
|
| 39 |
+
tokenizers>=0.20.0,<0.21.0
|
| 40 |
+
|
| 41 |
+
# ✅ SÉCURITÉ: Évite les conflits avec HF Spaces
|
| 42 |
+
accelerate>=0.20.0,<0.35.0
|
| 43 |
+
safetensors>=0.3.0,<0.5.0
|
| 44 |
+
huggingface-hub>=0.19.0,<0.26.0
|
| 45 |
+
|
| 46 |
+
# ========= 📝 TRAITEMENT TEXTE =========
|
| 47 |
+
beautifulsoup4==4.12.3
|
| 48 |
+
markdownify==0.11.6
|
| 49 |
+
lxml==4.9.3
|
| 50 |
+
regex==2023.10.3
|
| 51 |
+
bleach==6.1.0
|
| 52 |
+
|
| 53 |
+
# ========= 📊 DONNÉES + CALCULS =========
|
| 54 |
+
numpy>=1.24.0,<2.0.0
|
| 55 |
+
pandas>=2.0.0,<2.3.0
|
| 56 |
+
orjson==3.9.10
|
| 57 |
+
|
| 58 |
+
# ========= 🔬 MACHINE LEARNING SUPPORT =========
|
| 59 |
+
scikit-learn>=1.3.0,<1.6.0
|
| 60 |
+
matplotlib>=3.7.0,<3.10.0
|
| 61 |
+
seaborn>=0.12.0,<0.14.0
|
| 62 |
+
|
| 63 |
+
# ========= 🌐 RÉSEAU + HTTP =========
|
| 64 |
+
aiohttp==3.9.1
|
| 65 |
+
httpcore==1.0.2
|
| 66 |
+
httpx==0.25.2
|
| 67 |
+
tenacity==8.2.3
|
| 68 |
+
validators==0.22.0
|
| 69 |
+
requests>=2.31.0,<3.0.0
|
| 70 |
+
|
| 71 |
+
# ========= 💾 SYSTÈME + CACHE =========
|
| 72 |
+
psutil==5.9.6
|
| 73 |
+
cachetools==5.3.2
|
| 74 |
+
redis>=5.0.0,<6.0.0
|
| 75 |
+
diskcache>=5.6.0,<6.0.0
|
| 76 |
+
|
| 77 |
+
# ========= 📝 DOCUMENTATION + EXPORT =========
|
| 78 |
+
markdown>=3.5.0,<4.0.0
|
| 79 |
+
jinja2>=3.1.0,<4.0.0
|
| 80 |
+
|
| 81 |
+
# ========= 🐛 MONITORING + LOGGING =========
|
| 82 |
+
loguru>=0.7.0,<1.0.0
|
| 83 |
+
|
| 84 |
+
# ========= 🔐 SÉCURITÉ =========
|
| 85 |
+
cryptography>=41.0.0,<43.0.0
|
| 86 |
+
|
| 87 |
+
# ========= 🧪 TESTS + DÉVELOPPEMENT =========
|
| 88 |
+
pytest==7.4.3
|
| 89 |
+
pytest-asyncio>=0.21.0
|
| 90 |
+
|
| 91 |
+
# ========= 🎯 OPTIMISATIONS HF SPACES =========
|
| 92 |
+
# ✅ Versions épinglées pour éviter les conflits d'environnement
|
| 93 |
+
wheel>=0.40.0
|
| 94 |
+
setuptools>=65.0.0
|
| 95 |
+
|
| 96 |
+
# ========= 📌 NOTES IMPORTANTES =========
|
| 97 |
+
#
|
| 98 |
+
# 🔧 CORRECTIONS APPLIQUÉES:
|
| 99 |
+
# ✅ chonkie==1.0.8 (stable PyPI) au lieu de Git @commit
|
| 100 |
+
# ✅ LlamaIndex v0.12.8 (modulaire) avec imports corrigés
|
| 101 |
+
# ✅ Transformers 4.46+ compatible avec sentence-transformers 3.3.1
|
| 102 |
+
# ✅ Torch 2.x stable avec accélération CPU/GPU
|
| 103 |
+
# ✅ Versions épinglées pour HuggingFace Spaces
|
| 104 |
+
#
|
| 105 |
+
# 🚀 FONCTIONNALITÉS SUPPORTÉES:
|
| 106 |
+
# ✅ CustomRecursiveChunker (chunking hiérarchique intelligent)
|
| 107 |
+
# ✅ Chonkie SemanticChunker (si import réussit)
|
| 108 |
+
# ✅ Relations bidirectionnelles parent-enfant
|
| 109 |
+
# ✅ Export Obsidian format [[Titre]], id
|
| 110 |
+
# ✅ Base connaissance agents spécialisés
|
| 111 |
+
# ✅ Embeddings sémantiques via SentenceTransformer
|
| 112 |
+
# ✅ Pipeline FastAPI complet
|
| 113 |
+
#
|
| 114 |
+
# ⚠️ COMPATIBILITÉ HUGGINGFACE SPACES:
|
| 115 |
+
# - Utilise /tmp pour cache (write-accessible)
|
| 116 |
+
# - CPU uniquement sur HF Spaces gratuits
|
| 117 |
+
# - Mémoire limitée à ~2GB RAM
|
| 118 |
+
# - Pas de GPU sur plan gratuit
|
| 119 |
+
#
|
| 120 |
+
# 🔄 VERSIONS TESTÉES ET VALIDÉES:
|
| 121 |
+
# - Python 3.10+
|
| 122 |
+
# - Ubuntu 20.04+ / Debian 11+
|
| 123 |
+
# - HuggingFace Spaces (CPU)
|
| 124 |
+
# - Docker containers
|
| 125 |
+
#
|
| 126 |
+
# ============================================================================
|
schemas.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
from typing import List, Dict, Any, Optional, Union

from pydantic import BaseModel, Field, field_validator, validator
|
| 4 |
+
|
| 5 |
+
class ContentType(str, Enum):
    """Supported content types for incoming documents."""
    TEXT = "text"
    PDF = "pdf"
    YOUTUBE = "youtube"
    EMAIL = "email"
    WEB = "web"
    DOCUMENT = "document"
|
| 13 |
+
|
| 14 |
+
class ChunkLevel(str, Enum):
    """Hierarchical chunk levels, from whole document down to fine detail."""
    DOCUMENT = "document"      # Level 0 - entire document
    CHAPTER = "chapter"        # Level 1 - chapter / main section
    SECTION = "section"        # Level 2 - sub-section
    SUBSECTION = "subsection"  # Level 3 - sub-sub-section
    CONCEPT = "concept"        # Level 4 - concept / idea
    DETAIL = "detail"          # Level 5 - detail / example
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ChunkRequest(BaseModel):
    """Request payload for the chunking endpoint.

    Carries the raw text plus optional source metadata and tuning
    parameters for the hierarchical semantic chunker.
    """
    text: str = Field(
        ...,
        min_length=10,
        max_length=500000,  # hard cap sized for the free HF Space tier
        description="Texte à découper en chunks sémantiques"
    )
    source_id: Optional[str] = Field(
        None,
        description="Identifiant unique de la source"
    )
    titre: Optional[str] = Field(
        None,
        max_length=200,
        description="Titre du document"
    )
    source: Optional[str] = Field(
        None,
        description="URL ou référence de la source"
    )
    type: Optional[ContentType] = Field(
        ContentType.TEXT,
        description="Type de contenu"
    )

    # Optional chunking parameters
    chunk_sizes: Optional[List[int]] = Field(
        [2048, 512, 128],
        description="Tailles hiérarchiques en tokens"
    )
    overlap_size: Optional[int] = Field(
        20,
        ge=0,
        le=100,
        description="Chevauchement entre chunks"
    )
    buffer_size: Optional[int] = Field(
        None,  # computed dynamically when left unset
        ge=1,
        le=10,
        description="Nombre de phrases par groupe (dynamique si None)"
    )
    breakpoint_threshold: Optional[float] = Field(
        90.0,
        ge=50.0,
        le=99.0,
        description="Seuil de similarité sémantique (percentile)"
    )
    include_metadata: Optional[bool] = Field(
        True,
        description="Inclure les métadonnées enrichies"
    )
    detect_structure: Optional[bool] = Field(
        True,
        description="Détecter automatiquement la structure (titres, sections)"
    )

    # Pydantic 2.x style: `@validator` is deprecated, use `@field_validator`.
    @field_validator('text')
    @classmethod
    def validate_text(cls, v):
        """Reject empty / whitespace-only text and strip surrounding whitespace."""
        if not v or not v.strip():
            raise ValueError('Le texte ne peut pas être vide')
        return v.strip()

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class ChunkMetadata(BaseModel):
    """Enriched metadata attached to a single chunk."""
    chunk_id: str = Field(..., description="Identifiant unique du chunk")
    level: int = Field(..., ge=0, le=5, description="Niveau hiérarchique (0-5)")
    level_name: ChunkLevel = Field(..., description="Nom du niveau")

    # Hierarchical relations (parent / children)
    parent_id: Optional[str] = Field(None, description="ID du chunk parent")
    children_ids: List[str] = Field(default_factory=list, description="IDs des chunks enfants")

    # Sequential navigation links within the document
    prev_id: Optional[str] = Field(None, description="ID du chunk précédent")
    next_id: Optional[str] = Field(None, description="ID du chunk suivant")

    # Position within the document
    global_index: int = Field(..., ge=0, description="Position globale dans le document")
    local_index: int = Field(..., ge=0, description="Position locale dans le niveau")

    # Source information
    source_id: Optional[str] = Field(None, description="ID de la source")
    source_title: Optional[str] = Field(None, description="Titre de la source")
    source_url: Optional[str] = Field(None, description="URL de la source")
    content_type: ContentType = Field(ContentType.TEXT, description="Type de contenu")

    # Semantic statistics
    tokens_count: int = Field(..., ge=0, description="Nombre de tokens")
    sentences_count: int = Field(..., ge=0, description="Nombre de phrases")

    # Automatic extraction results
    detected_title: Optional[str] = Field(None, description="Titre détecté automatiquement")
    main_concepts: List[str] = Field(default_factory=list, description="Concepts principaux extraits")
    keywords: List[str] = Field(default_factory=list, description="Mots-clés importants")

    # Automatic classification
    chunk_type: Optional[str] = Field(None, description="Type de chunk (concept, exemple, principe, etc.)")
    confidence_score: Optional[float] = Field(None, ge=0.0, le=1.0, description="Score de confiance du découpage")

    # Semantic context
    contextual_summary: Optional[str] = Field(None, description="Résumé contextuel local")
    related_chunks: List[str] = Field(default_factory=list, description="Chunks sémantiquement reliés")

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 139 |
+
|
| 140 |
+
class SemanticChunk(BaseModel):
    """A semantic chunk: its text content plus hierarchy-aware metadata."""
    content: str = Field(..., description="Contenu textuel du chunk")
    metadata: ChunkMetadata = Field(..., description="Métadonnées enrichies")

    # Embedding is optional to keep response payloads small
    embedding: Optional[List[float]] = Field(None, description="Vecteur d'embedding")

    # Computed similarity relations to other chunks
    similarity_scores: Dict[str, float] = Field(
        default_factory=dict,
        description="Scores de similarité avec d'autres chunks"
    )

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 158 |
+
|
| 159 |
+
class HierarchyLevel(BaseModel):
    """One level of the chunk hierarchy, with its chunks and size statistics."""
    level: int = Field(..., ge=0, le=5)
    level_name: ChunkLevel
    chunks: List[SemanticChunk]
    total_tokens: int
    avg_chunk_size: float

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 171 |
+
|
| 172 |
+
class ChunkResponse(BaseModel):
    """Complete response returned by the chunking endpoint."""
    # Main result
    chunks: List[SemanticChunk] = Field(..., description="Liste des chunks générés")

    # Full hierarchical structure
    hierarchy: List[HierarchyLevel] = Field(..., description="Structure hiérarchique complète")

    # Global statistics
    total_chunks: int = Field(..., ge=0, description="Nombre total de chunks")
    total_tokens: int = Field(..., ge=0, description="Nombre total de tokens")
    processing_time: float = Field(..., ge=0, description="Temps de traitement en secondes")

    # Source information
    source_metadata: Dict[str, Any] = Field(default_factory=dict, description="Métadonnées de la source")

    # Discovered concept relations
    concept_graph: Dict[str, Any] = Field(
        default_factory=dict,
        description="Graphe des concepts découverts"
    )


    # Export formatted for the "Second Brain" (Obsidian)
    obsidian_export: Optional[Dict[str, Any]] = Field(
        None,
        description="Export formaté pour Obsidian"
    )

    # Export for specialized AI agents
    agent_knowledge: Optional[Dict[str, Any]] = Field(
        None,
        description="Connaissances formatées pour agents IA"
    )

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 211 |
+
|
| 212 |
+
class HealthCheckResponse(BaseModel):
    """Response schema for the /health endpoint."""
    status: str
    pipeline_ready: bool
    memory_usage: Optional[Dict[str, Any]] = None
    last_check: Optional[str] = None

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|
| 223 |
+
|
| 224 |
+
class ErrorResponse(BaseModel):
    """Standardized error response schema."""
    error: str
    detail: Optional[str] = None
    error_code: Optional[str] = None
    timestamp: Optional[str] = None

    # Pydantic 2.x: allow field names starting with "model_"
    model_config = {
        "protected_namespaces": ()
    }
|