File size: 10,464 Bytes
f866820
 
 
 
 
7e07738
 
 
f866820
 
 
 
7e07738
f866820
 
 
 
 
7e07738
f866820
 
7e07738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
 
 
 
 
f866820
 
 
 
 
 
 
 
 
7e07738
 
 
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
 
 
 
 
 
 
 
 
 
 
 
 
 
f866820
 
 
 
 
 
 
 
 
 
7e07738
 
 
 
 
 
 
 
 
 
 
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
 
f866820
 
 
7e07738
 
 
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e07738
 
 
 
 
 
 
 
 
 
 
 
 
 
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
"""
Ingestion API for UI integration.

Provides functions to ingest documents from a directory
and optionally sync to Pinecone.

Supports both legacy markdown-only loading and multi-format
loading via Docling.
"""

import json
import os
import logging
from pathlib import Path
from typing import Dict, Any, List, Optional
from dataclasses import dataclass

from src.ingestion.load_docs import load_markdown_docs
from src.ingestion.chunker import chunk_documents, chunk_documents_with_structure
from src.ingestion.embeddings import batch_embed_chunks

logger = logging.getLogger(__name__)

# Optional dependency: the Docling-based loader provides multi-format parsing.
# DOCLING_AVAILABLE starts as False and is only flipped to True once the
# import below has succeeded; ingest_from_directory() checks this flag before
# calling any of the Docling helpers.
DOCLING_AVAILABLE = False
try:
    from src.ingestion.docling_loader import (
        load_documents_with_docling,
        convert_to_legacy_format,
        SUPPORTED_EXTENSIONS
    )
    DOCLING_AVAILABLE = True
except ImportError:
    # Docling loader could not be imported: keep the flag False and define
    # SUPPORTED_EXTENSIONS locally with just the markdown extensions so the
    # name still exists for get_supported_formats().
    logger.info("Docling not available, using markdown-only loader")
    SUPPORTED_EXTENSIONS = {".md", ".markdown"}


@dataclass
class IngestionResult:
    """Outcome summary returned by ``ingest_from_directory``.

    Fields are positional in this order: status, documents, chunks,
    output_path, errors.
    """
    status: str  # "success", "warning" or "error"
    documents: int  # number of loaded documents whose status was "OK"
    chunks: int  # number of embedded chunks written to the output file
    output_path: str  # the chunks file path the caller asked for
    errors: List[str]  # human-readable messages; empty when nothing went wrong


@dataclass
class SyncResult:
    """Outcome summary returned by ``sync_to_pinecone``.

    Fields are positional in this order: status, vectors_upserted, errors.
    """
    status: str  # "success", "partial", "warning" or "error"
    vectors_upserted: int  # vectors contained in batches that upserted cleanly
    errors: List[str]  # human-readable messages; empty on full success


def _write_chunks_jsonl(records: List[Dict[str, Any]], save_path: Path) -> None:
    """Write embedded chunk records to ``save_path`` as JSON Lines, atomically.

    The records are first written to a sibling ``<name>.tmp`` file which is
    then moved over ``save_path`` with ``os.replace``. If anything fails while
    serialising, a previously existing chunks file is left untouched instead
    of being truncated half-way.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = save_path.with_name(save_path.name + ".tmp")

    try:
        with tmp_path.open("w", encoding="utf-8") as fh:
            for rec in records:
                obj = {
                    "id": f"{rec['filename']}::{rec['chunk_id']}",
                    "filename": rec["filename"],
                    "chunk_id": rec["chunk_id"],
                    "text": rec.get("text", ""),
                    "chars": rec.get("chars", 0),
                    "element_type": rec.get("element_type", "text"),
                    "section_heading": rec.get("section_heading", ""),
                    "embedding": rec["embedding"]
                }
                fh.write(json.dumps(obj, ensure_ascii=False) + "\n")
        os.replace(str(tmp_path), str(save_path))
    finally:
        # After a successful replace the temp file no longer exists; it is
        # only still present when writing/replacing failed.
        if tmp_path.exists():
            tmp_path.unlink()


def ingest_from_directory(
    docs_dir: str,
    output_path: str = "data/chunks.jsonl",
    provider: str = "sentence-transformers",
    dim: int = 384,
    use_docling: bool = True,
    extensions: Optional[List[str]] = None,
    use_structure: bool = True,
    recursive: bool = False,
    *,
    max_tokens: int = 300,
    overlap: int = 50
) -> IngestionResult:
    """
    Ingest documents from a directory and save to chunks.jsonl.

    Pipeline: load documents -> chunk -> embed -> write one JSON object per
    chunk to ``output_path``.

    Args:
        docs_dir: Path to directory containing documents
        output_path: Path to save chunks.jsonl
        provider: Embedding provider ("sentence-transformers" or "local")
        dim: Embedding dimension
        use_docling: Use Docling for multi-format parsing (if available)
        extensions: File extensions to process (None = all supported);
            only forwarded to the Docling loader
        use_structure: Use structure-aware chunking (requires Docling)
        recursive: Search subdirectories recursively; only forwarded to the
            Docling loader
        max_tokens: Maximum tokens per chunk passed to the chunker
        overlap: Token overlap between chunks passed to the chunker

    Returns:
        IngestionResult with status and counts. ``status`` is "error" when
        the directory is missing or any step raises, "warning" when no
        documents or no chunks were produced, and "success" otherwise.
        On "success"/"warning" results ``errors`` also lists documents whose
        load status was not "OK".
    """
    # Validate directory
    if not os.path.isdir(docs_dir):
        return IngestionResult(
            status="error",
            documents=0,
            chunks=0,
            output_path=output_path,
            errors=[f"Directory not found: {docs_dir}"]
        )

    try:
        # Choose loader based on availability and preference
        docling_active = use_docling and DOCLING_AVAILABLE
        if docling_active:
            logger.info("Using Docling for multi-format document loading")
            parsed_docs = load_documents_with_docling(
                docs_dir,
                extensions=extensions,
                recursive=recursive
            )
            docs = convert_to_legacy_format(parsed_docs)
        else:
            logger.info("Using legacy markdown loader")
            docs = load_markdown_docs(docs_dir)

        if not docs:
            return IngestionResult(
                status="warning",
                documents=0,
                chunks=0,
                output_path=output_path,
                errors=["No documents found in directory"]
            )

        # Split loaded documents into successes and failures so the caller
        # can see which files were skipped instead of getting an empty list.
        # NOTE(review): assumes each doc dict carries a "filename" key like
        # the chunk dicts do - falls back to "<unknown>" otherwise.
        failed_docs = [d for d in docs if d.get("status") != "OK"]
        doc_count = len(docs) - len(failed_docs)
        load_errors = [
            f"Failed to load {d.get('filename', '<unknown>')}: {d.get('status')}"
            for d in failed_docs
        ]

        # Chunk documents (structure-aware only when Docling did the loading)
        if use_structure and docling_active:
            chunks = chunk_documents_with_structure(
                docs,
                max_tokens=max_tokens,
                overlap=overlap,
                use_structure=True
            )
        else:
            chunks = chunk_documents(docs, max_tokens=max_tokens, overlap=overlap)

        if not chunks:
            return IngestionResult(
                status="warning",
                documents=doc_count,
                chunks=0,
                output_path=output_path,
                errors=["No chunks generated from documents"] + load_errors
            )

        # Generate embeddings
        embedded = batch_embed_chunks(chunks, provider=provider, dim=dim)

        # Merge text and metadata back into embedded chunks, matching each
        # embedded record to its source chunk by (filename, chunk_id).
        chunk_map = {(c["filename"], c["chunk_id"]): c for c in chunks}
        for e in embedded:
            src = chunk_map.get((e["filename"], e["chunk_id"]))
            if src is not None:
                e["text"] = src.get("text", "")
                e["element_type"] = src.get("element_type", "text")
                e["section_heading"] = src.get("section_heading", "")

        # Save to file (atomic replace, see helper)
        _write_chunks_jsonl(embedded, Path(output_path))

        return IngestionResult(
            status="success",
            documents=doc_count,
            chunks=len(embedded),
            output_path=output_path,
            errors=load_errors
        )

    except Exception as e:
        # API boundary for the UI: log the traceback and report the failure
        # as a result object rather than propagating.
        logger.exception("Ingestion failed")
        return IngestionResult(
            status="error",
            documents=0,
            chunks=0,
            output_path=output_path,
            errors=[str(e)]
        )


def sync_to_pinecone(
    chunks_path: str = "data/chunks.jsonl",
    index_name: Optional[str] = None,
    batch_size: int = 100
) -> SyncResult:
    """
    Upload embeddings from chunks.jsonl to Pinecone.

    Args:
        chunks_path: Path to chunks.jsonl file
        index_name: Pinecone index name (uses config default if None)
        batch_size: Number of vectors to upsert per batch (must be >= 1)

    Returns:
        SyncResult with status and count. ``status`` is:
        "success" when every batch was upserted,
        "partial" when some batches failed but at least one succeeded,
        "warning" when the file holds no chunks or no chunk has an embedding,
        "error" for invalid arguments, a missing file, an unresolved index
        host, every batch failing, or any unexpected exception.
    """
    errors: List[str] = []

    # Validate file exists
    if not os.path.isfile(chunks_path):
        return SyncResult(
            status="error",
            vectors_upserted=0,
            errors=[f"Chunks file not found: {chunks_path}"]
        )

    # range() below rejects a zero step and yields nothing for a negative
    # one, which would otherwise surface as a cryptic error or a false
    # "success" with zero vectors.
    if batch_size < 1:
        return SyncResult(
            status="error",
            vectors_upserted=0,
            errors=[f"batch_size must be a positive integer, got {batch_size}"]
        )

    try:
        # Load Pinecone config (imported lazily so the module can be imported
        # without the Pinecone client installed)
        import src.config as cfg
        from pinecone import Pinecone

        if index_name is None:
            index_name = cfg.PINECONE_INDEX_NAME

        # Initialize Pinecone
        pc = Pinecone(api_key=cfg.PINECONE_API_KEY)
        idx_meta = pc.describe_index(index_name)

        # Get host: try attribute access first, then mapping-style access,
        # but only when the description object actually offers .get().
        host = getattr(idx_meta, "host", None)
        if not host and hasattr(idx_meta, "get"):
            host = idx_meta.get("host")
        if not host:
            return SyncResult(
                status="error",
                vectors_upserted=0,
                errors=[f"Could not get host for index: {index_name}"]
            )

        index = pc.Index(host=host)

        # Stream the chunks file and build vectors in one pass so the parsed
        # chunks are not held in memory a second time.
        # NOTE(review): element_type / section_heading written by
        # ingest_from_directory are not forwarded as metadata - confirm
        # whether that is intended.
        total_chunks = 0
        vectors: List[Dict[str, Any]] = []
        with open(chunks_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                chunk = json.loads(line)
                total_chunks += 1

                embedding = chunk.get("embedding")
                if not embedding:
                    continue

                vectors.append({
                    "id": chunk["id"],
                    "values": embedding,
                    "metadata": {
                        "filename": chunk.get("filename", ""),
                        "chunk_id": chunk.get("chunk_id", 0),
                        # Limit metadata size; tolerate an explicit null text
                        "text": (chunk.get("text") or "")[:1000]
                    }
                })

        if total_chunks == 0:
            return SyncResult(
                status="warning",
                vectors_upserted=0,
                errors=["No chunks to upload"]
            )

        skipped = total_chunks - len(vectors)
        if skipped:
            logger.warning("Skipping %d chunk(s) without embeddings", skipped)

        if not vectors:
            return SyncResult(
                status="warning",
                vectors_upserted=0,
                errors=["No chunks with embeddings to upload"]
            )

        # Upsert in batches; one failing batch does not abort the others.
        upserted = 0
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            try:
                index.upsert(vectors=batch)
                upserted += len(batch)
            except Exception as e:
                logger.warning(
                    "Pinecone upsert failed for batch %d", i // batch_size,
                    exc_info=True
                )
                errors.append(f"Batch {i // batch_size} failed: {str(e)[:100]}")

        if not errors:
            status = "success"
        elif upserted:
            status = "partial"
        else:
            # Every batch failed: nothing was uploaded at all.
            status = "error"

        return SyncResult(
            status=status,
            vectors_upserted=upserted,
            errors=errors
        )

    except Exception as e:
        # API boundary for the UI: keep the traceback in the log (as
        # ingest_from_directory does) and report the failure as a result.
        logger.exception("Pinecone sync failed")
        return SyncResult(
            status="error",
            vectors_upserted=0,
            errors=[str(e)]
        )


def get_supported_formats() -> Dict[str, Any]:
    """
    Describe which document formats this module can currently load.

    Returns:
        Dict with three keys: "docling_available" (the module-level flag),
        "supported_extensions" (SUPPORTED_EXTENSIONS as a list) and
        "loader" ("docling" or "markdown-only", depending on the flag)
    """
    if DOCLING_AVAILABLE:
        loader_name = "docling"
    else:
        loader_name = "markdown-only"

    formats: Dict[str, Any] = {}
    formats["docling_available"] = DOCLING_AVAILABLE
    formats["supported_extensions"] = [*SUPPORTED_EXTENSIONS]
    formats["loader"] = loader_name
    return formats


def get_index_status(chunks_path: str = "data/chunks.jsonl") -> Dict[str, Any]:
    """
    Summarise the chunks file that backs the current index.

    Args:
        chunks_path: Path to chunks.jsonl file

    Returns:
        Dict with "exists", "chunks" (non-blank lines), "documents"
        (distinct "filename" values) and "path". If the file exists but
        cannot be read or parsed, both counts are 0 and an "error" key
        holds the exception text.
    """
    status: Dict[str, Any] = {
        "exists": os.path.isfile(chunks_path),
        "chunks": 0,
        "documents": 0,
        "path": chunks_path
    }
    if not status["exists"]:
        return status

    try:
        seen_files = set()
        total = 0

        with open(chunks_path, "r", encoding="utf-8") as handle:
            # Blank lines are ignored; every other line must be a JSON object.
            for record_text in filter(None, map(str.strip, handle)):
                record = json.loads(record_text)
                total += 1
                seen_files.add(record.get("filename", ""))
    except Exception as exc:
        # Counts stay at zero on any failure; only the message is reported.
        status["error"] = str(exc)
        return status

    status["chunks"] = total
    status["documents"] = len(seen_files)
    return status