File size: 11,881 Bytes
a83c934
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
#!/usr/bin/env python3
"""
Book content embedding script

Reads markdown files from docs/ (including all nested subdirectories), chunks content by headings or word count,
generates embeddings with OpenAI, and uploads to Qdrant vector database.

Usage:
    python backend/scripts/embed_book_content.py --book-path docs/ --collection-name humanoid-robotics-book-v1
"""
import argparse
import asyncio
import os
import re
import sys
from pathlib import Path
from typing import List, Dict, Any
from uuid import uuid4

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from openai import AsyncOpenAI
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

from src.config.settings import settings
from src.utils.logger import setup_logging, get_logger

setup_logging(level="INFO")
logger = get_logger(__name__)


class BookContentChunker:
    """Chunks markdown content intelligently by headings and word limits"""

    def __init__(self, chunk_size: int = 500, overlap: int = 50):
        """
        Initialize chunker

        Args:
            chunk_size: Target chunk size in words
            overlap: Word overlap between consecutive chunks

        Raises:
            ValueError: If overlap >= chunk_size — the sliding window in
                _chunk_section would never advance, causing an infinite loop.
        """
        if overlap >= chunk_size:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_markdown(self, content: str, file_path: str) -> List[Dict[str, Any]]:
        """
        Chunk markdown content by headings and word limits

        Args:
            content: Markdown file content
            file_path: Path to markdown file (for metadata)

        Returns:
            List of chunk dictionaries with content and metadata
        """
        chunks = []

        # Extract chapter/module name from file path
        path_obj = Path(file_path)
        chapter = self._extract_chapter_name(path_obj)

        # Split by headings (## and ###); the capture group keeps the heading
        # lines themselves as separate elements of the result list.
        sections = re.split(r'(^#{2,3}\s+.+$)', content, flags=re.MULTILINE)

        current_section_heading = "Introduction"
        current_content = []

        for section in sections:
            # Check if this element is a heading line
            heading_match = re.match(r'^(#{2,3})\s+(.+)$', section.strip())

            if heading_match:
                # Save previous section if it has content
                if current_content:
                    section_chunks = self._chunk_section(
                        "\n".join(current_content),
                        chapter,
                        current_section_heading
                    )
                    chunks.extend(section_chunks)

                # Start new section
                current_section_heading = heading_match.group(2).strip()
                current_content = []
            else:
                # Accumulate content (skip whitespace-only fragments)
                if section.strip():
                    current_content.append(section.strip())

        # Process last section
        if current_content:
            section_chunks = self._chunk_section(
                "\n".join(current_content),
                chapter,
                current_section_heading
            )
            chunks.extend(section_chunks)

        return chunks

    def _chunk_section(self, content: str, chapter: str, section: str) -> List[Dict[str, Any]]:
        """Chunk a section by word count with overlap between adjacent chunks."""
        words = content.split()
        chunks = []

        if len(words) <= self.chunk_size:
            # Section fits in one chunk
            chunks.append({
                "content": content,
                "chapter": chapter,
                "section": section,
                "heading": section,
                "chunk_index": 0,
                "word_count": len(words),
            })
        else:
            # Split into multiple chunks with overlap
            chunk_index = 0
            start = 0

            while start < len(words):
                end = start + self.chunk_size
                chunk_words = words[start:end]

                chunks.append({
                    "content": " ".join(chunk_words),
                    "chapter": chapter,
                    "section": section,
                    "heading": section,
                    "chunk_index": chunk_index,
                    "word_count": len(chunk_words),
                })

                chunk_index += 1
                # Step back by `overlap` words so adjacent chunks share context.
                # __init__ guarantees overlap < chunk_size, so start always advances.
                start = end - self.overlap

        return chunks

    def _extract_chapter_name(self, path: Path) -> str:
        """Extract chapter/module name from file path"""
        # Try to extract from directory or filename
        parts = path.parts

        # Look for patterns like "module1-ros2", "Module 1", etc.
        for part in reversed(parts):
            if re.match(r'module[-\s]*\d+', part, re.IGNORECASE):
                return part.replace('-', ' ').title()

        # Fallback to filename without extension
        return path.stem.replace('-', ' ').replace('_', ' ').title()


class BookEmbedder:
    """Handles embedding generation and Qdrant upload"""

    def __init__(self, collection_name: str = "book_content"):
        """
        Initialize embedder

        Args:
            collection_name: Qdrant collection name
        """
        self.collection_name = collection_name
        self.openai_client = AsyncOpenAI(api_key=settings.openai_api_key)
        self.qdrant_client = AsyncQdrantClient(
            url=settings.qdrant_url,
            api_key=settings.qdrant_api_key,
            timeout=30,  # Set a higher timeout (seconds)
        )

    async def create_collection(self):
        """Create Qdrant collection if it doesn't exist, with improved connection error handling"""
        try:
            collections = await self.qdrant_client.get_collections()
        except Exception as e:
            logger.error(
                "\nCannot connect to Qdrant. "
                f"Error: {type(e).__name__}: {e}\n"
                "-> Please make sure your Qdrant server is running and accessible at the configured URL.\n"
                f"-> Current Qdrant URL: {settings.qdrant_url}"
            )
            logger.error("Exiting due to Qdrant connection failure.")
            # `sys` is imported at module level; no need for a redundant local import.
            sys.exit(1)

        collection_names = [col.name for col in collections.collections]

        if self.collection_name not in collection_names:
            await self.qdrant_client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=settings.vector_size,
                    distance=Distance.COSINE,
                ),
            )
            logger.info(f"Created collection: {self.collection_name}")
        else:
            logger.info(f"Collection already exists: {self.collection_name}")

    async def embed_text(self, text: str) -> List[float]:
        """
        Generate embedding for text using OpenAI

        Args:
            text: Text to embed

        Returns:
            Embedding vector
        """
        response = await self.openai_client.embeddings.create(
            model=settings.openai_embedding_model,
            input=text
        )
        return response.data[0].embedding

    async def upload_chunks(self, chunks: List[Dict[str, Any]], doc_version: str = "v1.0.0"):
        """
        Upload chunks with embeddings to Qdrant

        Embeds each chunk sequentially, accumulates points, and flushes them
        to Qdrant in batches of 100 to bound per-request payload size.

        Args:
            chunks: List of chunk dictionaries
            doc_version: Document version identifier
        """
        logger.info(f"Uploading {len(chunks)} chunks to Qdrant...")

        points = []

        for i, chunk in enumerate(chunks):
            # Generate embedding
            embedding = await self.embed_text(chunk["content"])

            # Create point with a random UUID id and the chunk's metadata payload
            point = PointStruct(
                id=str(uuid4()),
                vector=embedding,
                payload={
                    "content": chunk["content"],
                    "chapter": chunk["chapter"],
                    "section": chunk["section"],
                    "heading": chunk["heading"],
                    "chunk_index": chunk["chunk_index"],
                    "word_count": chunk["word_count"],
                    "doc_version": doc_version,
                }
            )
            points.append(point)

            # Upload in batches of 100
            if len(points) >= 100:
                await self.qdrant_client.upsert(
                    collection_name=self.collection_name,
                    points=points
                )
                logger.info(f"Uploaded batch {i // 100 + 1} ({len(points)} points)")
                points = []

        # Upload remaining points
        if points:
            await self.qdrant_client.upsert(
                collection_name=self.collection_name,
                points=points
            )
            logger.info(f"Uploaded final batch ({len(points)} points)")

    async def close(self):
        """Close connections"""
        await self.qdrant_client.close()


def get_all_markdown_files_recursively(root_path: Path) -> List[Path]:
    """
    Find all markdown files recursively (as deep as needed) in the given root_path.

    Walks all subdirectories and returns both *.md and *.mdx files, skipping
    anything located inside a node_modules directory.

    Args:
        root_path: Path to the root directory

    Returns:
        List[Path]: List of all markdown file Paths
    """
    all_files = [*root_path.rglob("*.md"), *root_path.rglob("*.mdx")]
    # Match "node_modules" as an exact path component, not a substring of the
    # full path string, so files like "about_node_modules.md" are not excluded.
    return [f for f in all_files if f.is_file() and "node_modules" not in f.parts]


async def main():
    """Main embedding script"""
    parser = argparse.ArgumentParser(description="Embed book content into Qdrant")
    parser.add_argument(
        "--book-path",
        type=str,
        required=True,
        help="Path to book content directory (e.g., docs/)"
    )
    parser.add_argument(
        "--collection-name",
        type=str,
        default="humanoid-robotics-book-v1",
        help="Qdrant collection name"
    )
    parser.add_argument(
        "--doc-version",
        type=str,
        default="v1.0.0",
        help="Document version identifier"
    )
    args = parser.parse_args()

    # Wire up the chunker and the embedder for the requested collection
    content_chunker = BookContentChunker(chunk_size=500, overlap=50)
    book_embedder = BookEmbedder(collection_name=args.collection_name)

    try:
        # Ensure the target collection exists before any uploads
        await book_embedder.create_collection()

        # Discover every markdown file under the book path, however deep
        markdown_paths = get_all_markdown_files_recursively(Path(args.book_path))
        logger.info(f"Found {len(markdown_paths)} markdown files (.md and .mdx) recursively in all subdirectories")

        # Chunk each file and collect the results
        collected_chunks = []
        for markdown_path in markdown_paths:
            logger.info(f"Processing: {markdown_path}")
            file_chunks = content_chunker.chunk_markdown(
                markdown_path.read_text(encoding='utf-8'),
                str(markdown_path),
            )
            collected_chunks.extend(file_chunks)
            logger.info(f"  -> Generated {len(file_chunks)} chunks")

        logger.info(f"Total chunks: {len(collected_chunks)}")

        # Embed everything and push it into Qdrant
        await book_embedder.upload_chunks(collected_chunks, doc_version=args.doc_version)

        logger.info("✅ Embedding complete!")

    finally:
        await book_embedder.close()


if __name__ == "__main__":
    # Run main in asyncio loop, but trap connection errors globally as a last resort
    try:
        asyncio.run(main())
    except Exception as e:
        logger.error(f"FATAL: Exception occurred: {type(e).__name__}: {e}")
        logger.error("Please check if Qdrant is running, accessible, and credentials are set correctly.")
        # `sys` is already imported at the top of the file; the local re-import was redundant.
        sys.exit(1)