File size: 10,261 Bytes
c88e290
38ce8e2
6695d4a
10e6f84
 
d52f43b
38ce8e2
b6c13b7
10e6f84
6695d4a
ff3310f
d469e88
10e6f84
6695d4a
10e6f84
ff3310f
10e6f84
6695d4a
 
10e6f84
 
 
 
 
 
 
38ce8e2
b6c13b7
 
 
10e6f84
b6c13b7
 
 
 
 
 
38ce8e2
10e6f84
 
 
 
 
 
 
 
 
 
 
 
 
 
c88e290
10e6f84
 
 
 
 
 
 
 
 
e0f2368
6695d4a
10e6f84
 
e0f2368
10e6f84
 
 
38ce8e2
10e6f84
 
 
 
 
38ce8e2
10e6f84
 
 
6695d4a
e5ea137
10e6f84
 
 
 
 
 
 
 
 
38ce8e2
10e6f84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38ce8e2
10e6f84
 
 
 
 
 
 
 
 
38ce8e2
10e6f84
 
38ce8e2
9e30b0a
ff3310f
de87550
9e30b0a
 
10e6f84
9e30b0a
 
10e6f84
ff3310f
 
 
 
 
de87550
10e6f84
 
 
ff3310f
9e30b0a
ff3310f
9e30b0a
de87550
ff3310f
 
 
9e30b0a
 
38ce8e2
ff3310f
38ce8e2
ff3310f
38ce8e2
ff3310f
d469e88
 
 
 
9e30b0a
ff3310f
a1308bb
10e6f84
9e30b0a
 
a1308bb
 
9e30b0a
ff3310f
10e6f84
9e30b0a
 
 
10e6f84
ff3310f
9e30b0a
b6c13b7
38ce8e2
9e30b0a
38ce8e2
 
 
ff3310f
38ce8e2
10e6f84
 
 
 
ff3310f
10e6f84
 
 
 
 
 
38ce8e2
 
ff3310f
 
10e6f84
f6e4ae6
 
10e6f84
f6e4ae6
 
 
633b400
f6e4ae6
 
10e6f84
f6e4ae6
 
 
e4746b7
 
633b400
 
e4746b7
633b400
10e6f84
 
 
 
633b400
f6e4ae6
 
 
 
 
10e6f84
633b400
f6e4ae6
10e6f84
f6e4ae6
633b400
f6e4ae6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import os
import shutil
import logging
from typing import List, Tuple, Optional
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_core.documents import Document
from core.PineconeManager import PineconeManager
from core.AcronymManager import AcronymManager
from flashrank import Ranker, RerankRequest # NEW IMPORT

# CONFIGURATION
# Pinecone API key read from the environment; None when the variable is unset.
PINECONE_KEY = os.getenv("PINECONE_API_KEY")
# Root directory for local document copies; functions in this module join it
# with a username to get one sub-directory per user.
UPLOAD_DIR = "source_documents"
logger = logging.getLogger(__name__)

# Initialize Reranker (Small, fast CPU model)
# Runs once, at import time. If the model cannot be loaded the failure is
# logged as a warning and `reranker` is set to None instead of raising.
try:
    reranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2", cache_dir="/tmp/flashrank_cache")
except Exception as e:
    logger.warning(f"Reranker failed to load: {e}")
    reranker = None

def get_embedding_func(model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2"):
    """Return an embeddings object for ``model_name``.

    Names containing "openai" (case-insensitive) are served by
    ``OpenAIEmbeddings`` and require the ``OPENAI_API_KEY`` environment
    variable; every other name is loaded through ``HuggingFaceEmbeddings``.

    Args:
        model_name: Embedding model identifier. ``None`` or an empty string
            selects the default MiniLM model.

    Returns:
        The embeddings object for the requested model. If that model cannot
        be loaded, the error is logged and the default MiniLM model is
        returned instead of raising.
        NOTE(review): the fallback model may produce vectors of a different
        dimension than the one requested -- callers that write to an index
        should verify the dimension.
    """
    default_model = "sentence-transformers/all-MiniLM-L6-v2"

    # A missing name is a normal "use the default" request, not an error:
    # handle it up front instead of failing on ``None.lower()`` below and
    # logging a misleading load failure.
    if not model_name:
        return HuggingFaceEmbeddings(model_name=default_model)

    try:
        if "openai" in model_name.lower():
            if not os.getenv("OPENAI_API_KEY"):
                raise ValueError("OpenAI API Key not found.")
            return OpenAIEmbeddings(model=model_name)
        return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as e:
        logger.error(f"Failed to load embedding model '{model_name}': {e}")
        return HuggingFaceEmbeddings(model_name=default_model)

def save_uploaded_file(uploaded_file, username: str) -> str:
    """Write an uploaded file into the user's directory under ``UPLOAD_DIR``.

    Args:
        uploaded_file: Object exposing ``.name`` (the file name) and
            ``.getbuffer()`` (its bytes) -- presumably a Streamlit
            ``UploadedFile``; confirm against the caller.
        username: Name of the per-user sub-directory; created if missing.

    Returns:
        The path the file was written to.
    """
    user_dir = os.path.join(UPLOAD_DIR, username)
    os.makedirs(user_dir, exist_ok=True)
    # The name comes from the client: keep only its final component so a
    # value such as "../../x" cannot place the file outside ``user_dir``.
    safe_name = os.path.basename(uploaded_file.name)
    file_path = os.path.join(user_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path

class ParagraphChunker:
    """Splits text into paragraphs separated by double newlines."""

    def split_text(self, text):
        """Return the non-empty, whitespace-trimmed pieces of ``text`` split on "\\n\\n"."""
        paragraphs = []
        for piece in text.split('\n\n'):
            trimmed = piece.strip()
            if trimmed:
                paragraphs.append(trimmed)
        return paragraphs

def process_file(file_path: str, chunking_strategy: str = "paragraph") -> List[Document]:
    """Load a document from disk and split it into chunk ``Document`` objects.

    The loader is chosen from the (lower-cased) file extension: .pdf, .txt,
    .md, .docx and .pptx are handled. The loaded pages are joined with blank
    lines into one text, which is then split with
    ``RecursiveCharacterTextSplitter`` (chunk_size=500, chunk_overlap=50) when
    ``chunking_strategy`` is "token", or on double newlines for any other
    value. Every chunk gets "source" (the file's base name) and "strategy"
    metadata.

    Returns:
        The list of chunks, or ``[]`` when the extension is not handled or
        any step raises (the error is logged).
    """
    extension = os.path.splitext(file_path)[1].lower()
    try:
        if extension in (".txt", ".md"):
            loader = TextLoader(file_path, encoding='utf-8')
        elif extension == ".pdf":
            loader = PyPDFLoader(file_path)
        elif extension == ".docx":
            loader = UnstructuredWordDocumentLoader(file_path)
        elif extension == ".pptx":
            loader = UnstructuredPowerPointLoader(file_path)
        else:
            return []

        pages = loader.load()
        full_text = "\n\n".join(page.page_content for page in pages)

        if chunking_strategy == "token":
            token_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            chunks = token_splitter.create_documents([full_text])
        else:
            paragraphs = ParagraphChunker().split_text(full_text)
            chunks = [Document(page_content=paragraph) for paragraph in paragraphs]

        # Tag every chunk with where it came from and how it was split.
        source_name = os.path.basename(file_path)
        for chunk in chunks:
            chunk.metadata.update({"source": source_name, "strategy": chunking_strategy})

        return chunks
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []

def search_knowledge_base(query: str, username: str, index_name: str, embed_model_name: str, k: int = 5, final_k: int = 5):
    """
    Searches Pinecone with Reranking.
    1. Fetches ``final_k * 3`` candidates from the user's namespace.
    2. Reranks them with the module-level FlashRank ``reranker``.
    3. Returns the top ``final_k`` documents, each carrying a
       ``rerank_score`` metadata entry.

    If the reranker is not loaded, or reranking raises, the candidates are
    returned in vector-similarity order (truncated to ``final_k``) instead.

    Args:
        query: Search text.
        username: Pinecone namespace to search.
        index_name: Pinecone index to search.
        embed_model_name: Embedding model used to embed the query.
        k: Currently unused by this function; kept so existing callers
            keep working.
        final_k: Number of documents to return.

    Returns:
        A list of ``Document`` objects; ``[]`` when the Pinecone key or index
        name is missing, or when retrieval itself fails (the error is logged).
    """
    if not PINECONE_KEY or not index_name:
        return []

    try:
        pm = PineconeManager(PINECONE_KEY)
        emb_fn = get_embedding_func(embed_model_name)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)

        # 1. RETRIEVE BROAD (Fetch 3x what we need)
        broad_k = final_k * 3
        initial_docs = vstore.similarity_search(query, k=broad_k)

        if not initial_docs or not reranker:
            return initial_docs[:final_k]

        # 2. RERANK. The reranker is optional: a failure here must not throw
        # away the candidates that were already retrieved.
        try:
            passages = [
                {"id": str(i), "text": doc.page_content, "meta": doc.metadata}
                for i, doc in enumerate(initial_docs)
            ]
            rerank_request = RerankRequest(query=query, passages=passages)
            ranked_results = reranker.rerank(rerank_request)

            # 3. SELECT TOP K
            final_docs = []
            for res in ranked_results[:final_k]:
                # Copy so the score is not written into the metadata of the
                # candidate documents (which are the fallback result).
                meta = dict(res.get("meta") or {})
                meta["rerank_score"] = res.get("score")  # Useful for debugging
                final_docs.append(Document(page_content=res["text"], metadata=meta))
            return final_docs
        except Exception as e:
            logger.error(f"Rerank failed, returning vector-order results: {e}")
            return initial_docs[:final_k]

    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []

def process_and_add_text(text: str, source_name: str, username: str, index_name: str, embed_model_name: Optional[str] = None) -> Tuple[bool, str]:
    """Index a piece of generated text as a single document and keep a local copy.

    Existing vectors stored for ``source_name`` in the user's namespace are
    deleted first, the text is written to ``UPLOAD_DIR/<username>/<source_name>``,
    and the whole text is uploaded as one vector with id ``"<source_name>_0"``.

    Args:
        text: Content to store.
        source_name: Logical file name; used as the "source" metadata value,
            the backup file name and the vector id prefix.
        username: Pinecone namespace and local sub-directory.
        index_name: Target Pinecone index.
        embed_model_name: Embedding model to use. Defaults to ``None``, which
            keeps the previous behaviour of using the default model of
            ``get_embedding_func``. Pass the model the index was built with so
            the text is embedded the same way as the search queries.

    Returns:
        ``(True, message)`` on success, ``(False, reason)`` when configuration
        is missing or any step raises (the error is logged).
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."
    try:
        pm = PineconeManager(PINECONE_KEY)

        # 1. PRE-EMPTIVE DELETE
        pm.delete_file(index_name, source_name, namespace=username)

        # 2. SAVE BACKUP
        user_docs_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_docs_dir, exist_ok=True)
        backup_path = os.path.join(user_docs_dir, source_name)
        with open(backup_path, "w", encoding='utf-8') as f:
            f.write(text)

        # 3. UPLOAD
        # Only forward an explicit model name; with none given, fall back to
        # the helper's own default exactly as before.
        emb_fn = get_embedding_func(embed_model_name) if embed_model_name else get_embedding_func()
        doc = Document(page_content=text, metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"})
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        vstore.add_documents([doc], ids=[f"{source_name}_0"])

        return True, f"Updated: {source_name}"
    except Exception as e:
        logger.error(f"Error indexing text: {e}")
        return False, str(e)

def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: Optional[str] = None, strategy: str = "paragraph") -> Tuple[bool, str]:
    """Chunk a file and upload its chunks to Pinecone, replacing earlier vectors.

    Steps: chunk the file with ``process_file``, feed every chunk to
    ``AcronymManager.scan_text_for_acronyms``, verify that the embedding
    model's vector size is compatible with the index, delete vectors
    previously stored for this file name, then upload the chunks with ids
    ``"<source>_<chunk number>"``.

    Args:
        file_path: Path of the file to ingest.
        username: Pinecone namespace.
        index_name: Target Pinecone index.
        embed_model_name: Embedding model to use; ``None`` selects the default
            model of ``get_embedding_func``.
        strategy: Chunking strategy passed to ``process_file``.

    Returns:
        ``(True, message)`` on success, ``(False, reason)`` when configuration
        is missing, no chunks were produced, the dimension check fails, or
        any step raises (the error is logged).
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."
    try:
        # 1. Chunking
        docs = process_file(file_path, chunking_strategy=strategy)
        if not docs:
            return False, "No valid chunks generated."

        # 2. Acronym Learning
        acronym_mgr = AcronymManager()
        for doc in docs:
            acronym_mgr.scan_text_for_acronyms(doc.page_content)

        # 3. Pinecone Manager
        pm = PineconeManager(PINECONE_KEY)

        # 4. SAFETY CHECK
        # Only forward an explicit model name, so the "no model given" case
        # does not go through the helper's error path.
        emb_fn = get_embedding_func(embed_model_name) if embed_model_name else get_embedding_func()
        # Embed a probe string to learn the vector size the model produces.
        test_vec = emb_fn.embed_query("test")
        model_dim = len(test_vec)
        if not pm.check_dimension_compatibility(index_name, model_dim):
            # model_dim is what the MODEL produces, not what the index expects.
            return False, f"Dimension Mismatch! The embedding model produces {model_dim}d vectors, which index '{index_name}' does not accept."

        # 5. PRE-EMPTIVE DELETE
        filename = os.path.basename(file_path)
        pm.delete_file(index_name, filename, namespace=username)

        # 6. UPLOAD
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
        vstore.add_documents(docs, ids=custom_ids)

        return True, f"Successfully updated {filename} ({len(docs)} chunks)."

    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        return False, str(e)

def delete_document(username: str, filename: str, index_name: str):
    """Remove a document's local copy and its vectors in Pinecone.

    The local file under ``UPLOAD_DIR/<username>/`` is removed when it exists.
    The Pinecone deletion only runs when both the API key and ``index_name``
    are set; a failure there is logged, not raised.
    """
    local_copy = os.path.join(UPLOAD_DIR, username, filename)
    if os.path.exists(local_copy):
        os.remove(local_copy)

    if not (PINECONE_KEY and index_name):
        return

    try:
        manager = PineconeManager(PINECONE_KEY)
        manager.delete_file(index_name, filename, namespace=username)
    except Exception as e:
        logger.error(f"Pinecone delete failed: {e}")

def list_documents(username: str) -> List[dict]:
    """List the documents stored locally for ``username``.

    Only entries whose name ends (case-insensitively) with one of the
    extensions handled by ``process_file`` are listed.

    Returns:
        One ``{"filename": name, "source": name}`` dict per matching entry of
        ``UPLOAD_DIR/<username>``; ``[]`` when that directory does not exist.
    """
    user_dir = os.path.join(UPLOAD_DIR, username)
    # isdir rather than exists: a stray regular file at this path would make
    # os.listdir raise.
    if not os.path.isdir(user_dir):
        return []
    # Same extensions process_file can ingest; ".pptx" was previously left
    # out, which hid ingested PowerPoint files from the listing.
    listed_extensions = ('.txt', '.md', '.pdf', '.docx', '.pptx')
    return [
        {"filename": f, "source": f}
        for f in os.listdir(user_dir)
        if f.lower().endswith(listed_extensions)
    ]

def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
    """Recreate the user's local files from the vectors stored in Pinecone.

    All vector ids of the namespace are fetched in batches of 100. Each
    vector's text (metadata "text", else "page_content") is grouped under its
    "source" metadata value ("unknown.txt" when absent). The chunk position is
    the integer after the last "_" of the vector id (0 when missing or not a
    number). Per source, chunks are sorted by position, joined with blank
    lines and written to ``UPLOAD_DIR/<username>/<source>``.

    Returns:
        ``(True, message)`` with the number of files written, or
        ``(False, reason)`` when configuration is missing, the namespace has
        no ids, or any step raises (the error is logged).
    """
    if not (PINECONE_KEY and index_name):
        return False, "Pinecone config missing."
    try:
        manager = PineconeManager(PINECONE_KEY)
        all_ids = manager.get_all_ids(index_name, username)
        if not all_ids:
            return False, "No data found in Pinecone."

        page_size = 100
        pieces_by_source = {}
        for start in range(0, len(all_ids), page_size):
            id_page = all_ids[start : start + page_size]
            fetched = manager.fetch_vectors(index_name, id_page, username)
            for vector_id, record in fetched.vectors.items():
                metadata = record.metadata or {}
                source = metadata.get('source', 'unknown.txt')
                body = metadata.get('text') or metadata.get('page_content') or ''

                # The chunk position is the numeric suffix after the last "_".
                _, separator, suffix = vector_id.rpartition('_')
                try:
                    position = int(suffix) if separator else 0
                except ValueError:
                    position = 0

                pieces_by_source.setdefault(source, []).append((position, body))

        target_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(target_dir, exist_ok=True)
        restored = 0
        for source, pieces in pieces_by_source.items():
            # Restore the original chunk order before stitching the text.
            ordered = sorted(pieces, key=lambda piece: piece[0])
            stitched = "\n\n".join([piece[1] for piece in ordered])
            with open(os.path.join(target_dir, source), "w", encoding="utf-8") as out:
                out.write(stitched)
            restored += 1
        return True, f"Restored {restored} files (Sorted) from Pinecone!"
    except Exception as e:
        logger.error(f"Cache rebuild failed: {e}")
        return False, str(e)