File size: 10,725 Bytes
c88e290
38ce8e2
6695d4a
38ce8e2
6695d4a
38ce8e2
 
 
6695d4a
690eea6
38ce8e2
c88e290
38ce8e2
6695d4a
 
 
38ce8e2
 
 
 
 
 
6695d4a
 
 
 
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6695d4a
38ce8e2
c88e290
6695d4a
 
 
 
 
 
 
e0f2368
 
6695d4a
 
c88e290
6695d4a
 
 
 
 
e0f2368
6695d4a
38ce8e2
6695d4a
38ce8e2
c88e290
e0f2368
7841205
6695d4a
7841205
c88e290
6695d4a
 
 
 
 
38ce8e2
6695d4a
 
38ce8e2
6695d4a
 
 
c88e290
 
6695d4a
38ce8e2
 
6695d4a
38ce8e2
6695d4a
 
 
38ce8e2
6695d4a
 
 
 
 
 
 
e0f2368
6695d4a
 
 
 
38ce8e2
6695d4a
38ce8e2
 
 
 
 
 
 
 
 
6695d4a
38ce8e2
6695d4a
 
 
 
e5ea137
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de87550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38ce8e2
99043ee
38ce8e2
 
99043ee
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99043ee
 
38ce8e2
 
 
 
 
 
 
 
cdc0512
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdc0512
38ce8e2
 
cdc0512
38ce8e2
 
 
cdc0512
38ce8e2
 
 
cdc0512
38ce8e2
 
 
cdc0512
38ce8e2
 
 
 
 
 
 
 
 
 
cdc0512
6cd615b
38ce8e2
6cd615b
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
cdc0512
38ce8e2
 
 
 
 
 
 
 
 
 
 
 
cdc0512
38ce8e2
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import os
import shutil
import logging
from typing import List, Literal, Optional, Tuple

# --- LANGCHAIN & DB IMPORTS ---
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder

# --- CUSTOM CORE IMPORTS ---
from core.ParagraphChunker import ParagraphChunker
from core.TokenChunker import TokenChunker

# --- CONFIGURATION ---
CHROMA_PATH = "chroma_db"
UPLOAD_DIR = "source_documents"
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Configure Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- LAZY LOADING GLOBALS ---
_embedding_func = None
_rerank_model = None

def get_embedding_func():
    """Return the shared HuggingFace embedding model, creating it on first use.

    The instance is cached in a module-level global so the model is only
    loaded once (lazy initialization keeps startup cheap).
    """
    global _embedding_func
    if _embedding_func is not None:
        return _embedding_func
    logger.info(f"⏳ Loading Embedding Model: {EMBED_MODEL_NAME}...")
    _embedding_func = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    logger.info("✅ Embedding Model Loaded.")
    return _embedding_func

def get_rerank_model():
    """Return the shared Cross-Encoder reranker, creating it on first use.

    Cached in a module-level global so the model weights load only once.
    """
    global _rerank_model
    if _rerank_model is not None:
        return _rerank_model
    logger.info(f"⏳ Loading Reranker: {RERANK_MODEL_NAME}...")
    _rerank_model = CrossEncoder(RERANK_MODEL_NAME)
    logger.info("✅ Reranker Loaded.")
    return _rerank_model

# --- PART 1: CHUNKING LOGIC (The New System) ---

def _process_markdown(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> List[Document]:
    """Chunk a Markdown file: split on headers first, then enforce size limits.

    Header-aware splitting keeps each chunk within a single section; the
    recursive character splitter then caps chunks at chunk_size with
    chunk_overlap characters of overlap. Returns [] on any failure.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        # Split on the first three header levels, recording each in metadata.
        header_levels = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        header_splits = MarkdownHeaderTextSplitter(
            headers_to_split_on=header_levels
        ).split_text(markdown_text)

        size_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = size_splitter.split_documents(header_splits)

        base_name = os.path.basename(file_path)
        for chunk in chunks:
            chunk.metadata['source'] = base_name
            chunk.metadata['file_type'] = 'md'
            chunk.metadata['strategy'] = 'markdown_header'

        return chunks
    except Exception as e:
        logger.error(f"Error processing Markdown file {file_path}: {e}")
        return []

def process_file(
    file_path: str, 
    chunking_strategy: Literal["paragraph", "token"] = "paragraph",
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    model_name: str = "gpt-4"
) -> List[Document]:
    """
    Main chunking engine. Routes a file to the appropriate chunker by extension.

    Args:
        file_path: Path to the file on disk (.md, .pdf, or .txt supported).
        chunking_strategy: "paragraph" or "token" (only used for pdf/txt).
        chunk_size: Target chunk size (characters for md, tokens for token strategy).
        chunk_overlap: Overlap between consecutive chunks.
        model_name: Tokenizer model name passed to the chunkers.

    Returns:
        A list of chunked Documents, or [] if the file is missing,
        unsupported, or chunking fails.
    """
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return []

    file_extension = os.path.splitext(file_path)[1].lower()
    file_name = os.path.basename(file_path)
    logger.info(f"Processing {file_name} using strategy: {chunking_strategy}")

    # Markdown has its own header-aware pipeline.
    if file_extension == ".md":
        return _process_markdown(file_path, chunk_size, chunk_overlap)

    if file_extension not in (".pdf", ".txt"):
        logger.warning(f"Unsupported file extension: {file_extension}")
        return []

    if chunking_strategy == "token":
        chunker = TokenChunker(
            model_name=model_name,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
    else:
        chunker = ParagraphChunker(model_name=model_name)

    try:
        if file_extension == ".pdf":
            docs = chunker.process_document(file_path)
        else:  # ".txt"
            docs = chunker.process_text_file(file_path)

        # Normalize metadata so every ingestion path agrees.
        # FIX: 'file_type' was previously only set on Markdown chunks,
        # leaving pdf/txt chunks without it.
        for doc in docs:
            doc.metadata["source"] = file_name
            doc.metadata["strategy"] = chunking_strategy
            doc.metadata["file_type"] = file_extension.lstrip(".")

        return docs

    except Exception as e:
        logger.error(f"Error using {chunking_strategy} chunker on {file_name}: {e}")
        return []

# --- PART 2: DATABASE & FILE MANAGEMENT (The Old Stable System) ---

def save_uploaded_file(uploaded_file, username: str = "default") -> Optional[str]:
    """Persist a Streamlit UploadedFile to disk so file-based loaders can read it.

    Files are stored under UPLOAD_DIR/<username>/<original filename>.

    Args:
        uploaded_file: Streamlit upload object (provides .name and .getbuffer()).
        username: Per-user subdirectory name.

    Returns:
        The saved file path, or None if the write failed.
        (FIX: annotation was `-> str` although the error path returns None.)
    """
    try:
        user_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_dir, exist_ok=True)
        file_path = os.path.join(user_dir, uploaded_file.name)
        
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
            
        logger.info(f"File saved: {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"Error saving file: {e}")
        return None

def process_and_add_text(text: str, source_name: str, username: str) -> Tuple[bool, str]:
    """
    Index a raw text string (e.g. output of the Flattener tool) directly into
    the user's Chroma collection as a single document.

    Returns (success, human-readable status message).
    """
    try:
        # Open (or create) the per-user vector store.
        db = Chroma(
            persist_directory=os.path.join(CHROMA_PATH, username),
            embedding_function=get_embedding_func(),
        )

        # Wrap the raw text in a Document and index it in one shot.
        document = Document(
            page_content=text,
            metadata={
                "source": source_name, 
                "strategy": "flattened_text",
                "file_type": "generated"
            },
        )
        db.add_documents([document])

        return True, f"Successfully indexed flattened text: {source_name}"
        
    except Exception as e:
        logger.error(f"Error indexing raw text: {e}")
        return False, f"Error: {str(e)}"

def ingest_file(file_path: str, username: str, strategy: str = "paragraph") -> Tuple[bool, str]:
    """
    High-level bridge: chunk a file with the chosen strategy and persist the
    chunks to the user's vector DB. Replaces the old 'process_and_add_document'.

    Returns (success, human-readable status message).
    """
    try:
        # Chunk the file via the unified chunking engine.
        docs = process_file(file_path, chunking_strategy=strategy)
        if not docs:
            return False, "No valid chunks generated from file."

        # Persist all chunks into the per-user Chroma collection.
        store = Chroma(
            persist_directory=os.path.join(CHROMA_PATH, username),
            embedding_function=get_embedding_func(),
        )
        store.add_documents(docs)

        return True, f"Successfully indexed {len(docs)} chunks."

    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        return False, f"System Error: {str(e)}"

def search_knowledge_base(query: str, username: str, k: int = 10, final_k: int = 4) -> List[Document]:
    """Two-stage retrieval: vector search for k candidates, then cross-encoder
    re-ranking down to the best final_k documents.

    Returns [] if the user has no database, no hits, or an error occurs.
    """
    user_db_path = os.path.join(CHROMA_PATH, username)
    if not os.path.exists(user_db_path):
        return []
        
    try:
        # Stage 1: dense retrieval from the user's Chroma collection.
        db = Chroma(persist_directory=user_db_path, embedding_function=get_embedding_func())
        hits = db.similarity_search_with_relevance_scores(query, k=k)
        if not hits:
            return []

        # Stage 2: score each (query, chunk) pair with the cross-encoder
        # and keep the top final_k by that score.
        docs = [doc for doc, _ in hits]
        scores = get_rerank_model().predict(
            [[query, doc.page_content] for doc in docs]
        )

        ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
        return [doc for doc, _ in ranked[:final_k]]

    except Exception as e:
        logger.error(f"Search Error: {e}")
        return []

def list_documents(username: str) -> List[dict]:
    """
    Summarize the unique source files in the user's vector store for the
    sidebar list: filename, chunk count, and chunking strategy per file.

    Returns [] if the user has no database or the query fails.
    """
    user_db_path = os.path.join(CHROMA_PATH, username)
    if not os.path.exists(user_db_path):
        return []
        
    try:
        db = Chroma(persist_directory=user_db_path, embedding_function=get_embedding_func())

        # db.get() with no filter returns metadata for every stored chunk.
        inventory = {}
        for meta in db.get()['metadatas']:
            # Metadata keys can vary by ingestion path; fall back gracefully.
            src = meta.get('source', 'Unknown')
            entry = inventory.setdefault(
                src,
                {"chunks": 0, "strategy": meta.get('strategy', 'unknown')},
            )
            entry["chunks"] += 1

        # "source" duplicates "filename" for callers expecting either key.
        return [
            {"filename": name, "chunks": info["chunks"], "strategy": info["strategy"], "source": name}
            for name, info in inventory.items()
        ]
    except Exception as e:
        logger.error(f"Error listing docs: {e}")
        return []

def delete_document(username: str, filename: str) -> Tuple[bool, str]:
    """Remove every chunk whose 'source' metadata matches filename from the
    user's vector database.

    Args:
        username: Owner of the per-user Chroma collection.
        filename: The 'source' metadata value to purge.

    Returns:
        (success, human-readable status message).
    """
    user_db_path = os.path.join(CHROMA_PATH, username)
    try:
        emb_fn = get_embedding_func()
        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
        
        data = db.get()
        # Collect the record IDs belonging to this source file.
        ids_to_delete = [
            doc_id
            for doc_id, meta in zip(data['ids'], data['metadatas'])
            if meta.get('source') == filename
        ]

        if not ids_to_delete:
            return False, "File not found in index."

        db.delete(ids=ids_to_delete)
        # FIX: message previously read "Deleted (unknown)." — a leftover
        # broken template; report what was actually removed.
        return True, f"Deleted {len(ids_to_delete)} chunks from {filename}."
            
    except Exception as e:
        return False, f"Delete failed: {e}"

def reset_knowledge_base(username: str) -> Tuple[bool, str]:
    """Delete the user's entire Chroma directory from disk ("nuke" option).

    Returns (success, human-readable status message).
    """
    target_dir = os.path.join(CHROMA_PATH, username)
    if not os.path.exists(target_dir):
        return False, "Database already empty."
    shutil.rmtree(target_dir)
    return True, "Database Reset."