File size: 7,508 Bytes
4225666
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
from langchain_core.documents import Document
from pathlib import Path
from typing import Optional, List
from datetime import datetime, date
import uuid
import yaml
from app.services.text_splitter import TextSplitter
import json

# Value types that clean_metadata() keeps unchanged. datetime/date values are
# ISO-formatted before this check; anything else not listed here is passed
# through str().
ALLOWED = (str, int, float, bool, list, type(None))

def get_references_v2(docs, threshold: float):
    """
    Build reference records and a combined context string from scored results.

    Args:
        docs: Iterable of result objects exposing ``.document`` (an object with
            ``.metadata`` and ``.page_content``) and ``.fused_score``.
        threshold: Results whose ``fused_score`` is below this value are skipped.

    Returns:
        A dict with ``"documents"`` (one record per kept result, in input
        order) and ``"context"`` (the per-result context snippets concatenated).
    """
    references = []
    context_parts = []

    for hit in docs:
        source_doc = hit.document
        score = hit.fused_score
        if score < threshold:
            continue

        meta = source_doc.metadata
        # Title falls back through "name" and "topic" before "untitled".
        topic_or_default = meta.get("topic", "untitled")
        name_or_topic = meta.get("name", topic_or_default)
        reference = {
            "title": meta.get("title", name_or_topic),
            "chunk_index": meta.get("chunk_index"),
            "source": meta.get("source_file", meta.get("source", "untitled")),
            "page_content": source_doc.page_content,
            "similarity": score,
        }

        context_parts.append(
            f"{reference['title']} page_content: {reference['page_content']},"
            f"  from source: {reference['source']}.\n\n"
        )
        references.append(reference)

    return {"documents": references, "context": "".join(context_parts)}

def get_references(docs, threshold: float):
    """
    Build reference records and a combined context string from (doc, score) pairs.

    Args:
        docs: Iterable of indexable pairs; item ``[0]`` is an object with
            ``.metadata`` and ``.page_content``, item ``[1]`` is a number that
            is turned into a similarity as ``1 - value`` (presumably a
            distance; confirm against the caller).
        threshold: Pairs whose computed similarity is below this value are
            skipped.

    Returns:
        A dict with ``"documents"`` (one record per kept pair, in input order)
        and ``"context"`` (the per-pair context snippets concatenated).
    """
    references = []
    context_parts = []

    for pair in docs:
        source_doc = pair[0]
        score = 1 - pair[1]
        if score < threshold:
            continue

        meta = source_doc.metadata
        # Title falls back through "name" and "topic" before "untitled".
        topic_or_default = meta.get("topic", "untitled")
        name_or_topic = meta.get("name", topic_or_default)
        reference = {
            "title": meta.get("title", name_or_topic),
            "chunk_index": meta.get("chunk_index"),
            "source": meta.get("source_file", meta.get("source", "untitled")),
            "page_content": source_doc.page_content,
            "similarity": score,
        }

        # The snippet keeps the leading newline and the 8-space indentation
        # that the context string has always carried.
        context_parts.append(
            f"\n        page_content: {reference['page_content']},"
            f"  from source: {reference['source']}.\n        "
        )
        references.append(reference)

    return {"documents": references, "context": "".join(context_parts)}

def create_documents(
    chunks: List[str],
    filePath: Optional[Path] = None,
    built_in_metadata: Optional[dict] = None,
    title: Optional[str] = None
) -> List[Document]:
    """
    Create Document objects from text chunks with standard metadata (UUIDs, timestamps, indices).
    Works for both files (filePath provided) and raw text (filePath=None).

    Args:
        chunks: Text chunks, one Document is produced per chunk, in order.
        filePath: Optional source file. When it exists on disk, its stat
            timestamps, name and stem seed the dates, source and title.
        built_in_metadata: Optional extra metadata merged over the generated
            fields (so it can override ``source``, ``title``, ``doc_id`` and
            the dates). ``None`` is treated as an empty mapping.
        title: Optional explicit title; takes precedence over the file stem
            and over ``built_in_metadata["title"]`` when seeding the title.

    Returns:
        A list of Documents whose ``chunk_index`` always equals the chunk's
        position in ``chunks``.
    """
    # Work on a private copy: the caller's dict is never aliased, and an
    # explicit None (allowed by the Optional annotation) no longer crashes.
    extra_metadata = dict(built_in_metadata) if built_in_metadata else {}

    if filePath and filePath.exists():
        file_stat = filePath.stat()
        # NOTE: st_ctime is the creation time on Windows but the last
        # metadata change on Unix.
        created_date = datetime.fromtimestamp(file_stat.st_ctime).isoformat()
        modified_date = datetime.fromtimestamp(file_stat.st_mtime).isoformat()
        source = filePath.name
        given_title = title or filePath.stem
    else:
        now = datetime.now().isoformat()
        created_date = now
        modified_date = now
        # Use existing source from metadata if available, else fall back to
        # the (non-existent) file's name, else empty.
        source = extra_metadata.get("source", "")
        if not source and filePath:
            source = filePath.name
        given_title = title or extra_metadata.get("title", "Untitled")

    docs = []
    for i, chunk in enumerate(chunks):
        metadata = {
            "doc_id": str(uuid.uuid4()),   # unique chunk id unless overridden below
            "source": source,
            "title": given_title,
            "created_date": created_date,
            "modified_date": modified_date,
            "chunk_index": i,
        }
        # Caller-supplied metadata wins over the generated defaults...
        metadata.update(extra_metadata)
        # ...except for the chunk position, which must reflect this list.
        metadata["chunk_index"] = i

        docs.append(Document(page_content=chunk, metadata=metadata))
    return docs


def create_document(
    text: str,
    metadata: dict
):
    """Wrap ``text`` and ``metadata`` in a single Document, unchanged."""
    document = Document(page_content=text, metadata=metadata)
    return document


def clean_metadata(metadata: dict):
    """
    Return a copy of ``metadata`` whose values are normalised.

    datetime/date values become ISO-8601 strings, values whose type is listed
    in ``ALLOWED`` are kept as they are, and every other value is converted
    with ``str()``. Keys are left untouched.
    """
    def _normalise(value):
        # Dates are checked first so they are ISO-formatted rather than str()'d.
        if isinstance(value, (datetime, date)):
            return value.isoformat()
        return value if isinstance(value, ALLOWED) else str(value)

    return {key: _normalise(value) for key, value in metadata.items()}

def read_text_file(filePath: Path):
    """Return the full contents of ``filePath``, decoded as UTF-8 text."""
    with open(filePath, mode="r", encoding="utf-8") as handle:
        return handle.read()

def read_json_file(filePath: Path):
    """
    Parse the JSON file at ``filePath`` and return the decoded value.

    The file is read as UTF-8 explicitly, matching ``read_text_file``, so the
    result does not depend on the platform's default encoding.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filePath, "r", encoding="utf-8") as file:
        return json.load(file)

def build_metadata(filePath: Optional[Path] = None, content: Optional[str] = None):
    """
    Split a text into YAML frontmatter metadata and body content.

    Frontmatter is only recognised when a ``---`` ... ``---`` block opens the
    document (optionally preceded by a BOM/whitespace) and its YAML parses to
    a mapping. ``---`` separators that appear later in the text are left in
    the content, so section-delimited text is not mistaken for frontmatter.

    Args:
        filePath: Optional file to read (UTF-8). When given, it replaces
            ``content`` and its file name is always recorded as ``source``.
        content: Raw text to parse when no ``filePath`` is given.

    Returns:
        ``{"metadata": dict, "content": str}``. With frontmatter, ``metadata``
        is the cleaned frontmatter (``source`` defaults to ``""`` when neither
        the file nor the frontmatter provides one) and ``content`` is the
        stripped remainder. Without frontmatter, ``metadata`` only contains
        ``source`` when a file was given and ``content`` is the stripped text.

    Raises:
        ValueError: If neither ``filePath`` nor ``content`` is provided.
        yaml.YAMLError: If a leading frontmatter block is not valid YAML.
    """
    if filePath:
        content = read_text_file(filePath)
    if content is None:
        raise ValueError("build_metadata requires either filePath or content")

    parts = content.split("---", 2)

    # Only a block at the very start of the document is frontmatter; anything
    # before the first "---" (other than a BOM or whitespace) means the
    # separators are ordinary content.
    opens_with_block = len(parts) >= 3 and not parts[0].lstrip("\ufeff").strip()

    if opens_with_block:
        frontmatter = yaml.safe_load(parts[1]) or {}
        # A scalar or list between the markers is not metadata; fall through
        # and treat the whole text as content instead of crashing.
        if isinstance(frontmatter, dict):
            frontmatter = clean_metadata(frontmatter)

            # add file name as source always
            if filePath:
                frontmatter["source"] = filePath.name
            elif "source" not in frontmatter:
                frontmatter["source"] = ""

            return {
                "metadata": frontmatter,
                "content": parts[2].strip()
            }

    # Don't enforce empty source if not provided, allows external metadata to stick
    meta = {}
    if filePath:
        meta["source"] = filePath.name

    return {
        "metadata": meta,
        "content": content.strip()
    }

def create_documents_from_text(text: str, metadata: Optional[dict] = None):
    """
    Create documents from raw text with automatic splitting and metadata enrichment.

    Args:
        text: Raw text, optionally starting with YAML frontmatter.
        metadata: Optional caller-supplied metadata. Every key except
            ``source`` overrides the frontmatter; ``source`` is only used when
            the frontmatter does not provide a non-empty one. ``None`` is
            treated as an empty mapping.

    Returns:
        The list of Documents produced by ``create_documents`` for the chunks
        of the text body.
    """
    provided = metadata or {}

    text = text.strip()
    data = build_metadata(content=text)

    # 1. Smart Metadata Merge
    final_metadata = data["metadata"].copy()

    # The frontmatter's source wins when present. build_metadata returns no
    # "source" key at all for text without frontmatter, so test for a missing
    # or empty value rather than for "" only, otherwise the caller's source
    # would be silently dropped.
    if not final_metadata.get("source") and provided.get("source"):
        final_metadata["source"] = provided["source"]

    # Merge regular keys
    final_metadata.update({k: v for k, v in provided.items() if k != "source"})

    text = data["content"]

    # 2. Split text into chunks (strings)
    # Use section-aware splitter if text contains markdown section delimiters
    if "\n---\n" in text or text.startswith("---\n"):
        splitter = TextSplitter.for_markdown_with_sections()
    else:
        splitter = TextSplitter()
    chunks = splitter.split_text(text)

    # 3. Create documents using standard helper (adds IDs, indices, dates)
    return create_documents(
        chunks=chunks,
        filePath=None,
        built_in_metadata=final_metadata
    )

def load_json(filePath: Path):
    """
    Load a JSON file and turn each entry of its ``content`` mapping into Documents.

    The JSON object must provide ``id``, ``source`` and ``content``; the title
    is taken from ``name``, then ``title``, then ``"Untitled"``. Each content
    value is stripped and split with a default ``TextSplitter``; every
    non-blank chunk becomes a Document whose text is ``"<key>: <chunk>"``.
    ``chunk_index`` is the chunk's position in that value's split output, so
    skipped blank chunks still consume an index.

    Returns:
        A list of Documents in content-key order.
    """
    payload = read_json_file(filePath=filePath)
    source_name = Path(filePath).name

    # Fields shared by every Document created from this file.
    shared_metadata = {
        "id": payload["id"],
        "title": payload.get("name", payload.get("title", "Untitled")),
        "source": payload["source"],
        "source_file": source_name or "Untitled",
        "created_date": datetime.now().isoformat(),
    }

    documents = []
    text_splitter = TextSplitter()
    for topic, body in payload["content"].items():
        pieces = text_splitter.split_text(body.strip())
        for position, piece in enumerate(pieces):
            trimmed = piece.strip()
            if trimmed == "":
                continue
            chunk_metadata = {**shared_metadata, "topic": topic, "chunk_index": position}
            documents.append(
                Document(page_content=f"{topic}: {trimmed}", metadata=chunk_metadata)
            )
    return documents