import os
import shutil
import logging
from typing import List, Tuple, Optional
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from langchain_core.documents import Document
from core.PineconeManager import PineconeManager
from core.AcronymManager import AcronymManager
from flashrank import Ranker, RerankRequest # NEW IMPORT
# CONFIGURATION
PINECONE_KEY = os.getenv("PINECONE_API_KEY")
UPLOAD_DIR = "source_documents"
logger = logging.getLogger(__name__)
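# NOTE: PINECONE_API_KEY (and OPENAI_API_KEY when an OpenAI embedding model is
# selected) must be set in the environment; UPLOAD_DIR also serves as the local,
# per-user backup of everything that gets indexed.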
# Initialize Reranker (Small, fast CPU model)
# Only initializes once when the app starts
try:
    reranker = Ranker(model_name="ms-marco-TinyBERT-L-2-v2", cache_dir="/tmp/flashrank_cache")
except Exception as e:
    logger.warning(f"Reranker failed to load: {e}")
    reranker = None
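# If loading fails, reranker stays None and search_knowledge_base silently
# falls back to raw vector-similarity order.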
def get_embedding_func(model_name: Optional[str] = None):
    """Return an embedding function; fall back to local MiniLM if loading fails."""
    model_name = model_name or "sentence-transformers/all-MiniLM-L6-v2"
    try:
        if "openai" in model_name.lower():
            if not os.getenv("OPENAI_API_KEY"): raise ValueError("OpenAI API Key not found.")
            return OpenAIEmbeddings(model=model_name)
        else:
            return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as e:
        logger.error(f"Failed to load embedding model '{model_name}': {e}")
        return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
def save_uploaded_file(uploaded_file, username: str) -> str:
    user_dir = os.path.join(UPLOAD_DIR, username)
    os.makedirs(user_dir, exist_ok=True)
    file_path = os.path.join(user_dir, uploaded_file.name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
class ParagraphChunker:
    """Split text into paragraph chunks on blank lines."""
    def split_text(self, text):
        return [p.strip() for p in text.split('\n\n') if p.strip()]
def process_file(file_path: str, chunking_strategy: str = "paragraph") -> List[Document]:
    """Load a file and split it into Document chunks ("token" or "paragraph" strategy)."""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf": loader = PyPDFLoader(file_path)
        elif ext == ".txt": loader = TextLoader(file_path, encoding='utf-8')
        elif ext == ".docx": loader = UnstructuredWordDocumentLoader(file_path)
        elif ext == ".pptx": loader = UnstructuredPowerPointLoader(file_path)
        elif ext == ".md": loader = TextLoader(file_path, encoding='utf-8')
        else: return []
        raw_docs = loader.load()
        text = "\n\n".join([d.page_content for d in raw_docs])
        if chunking_strategy == "token":
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            chunks = splitter.create_documents([text])
        else:
            chunker = ParagraphChunker()
            texts = chunker.split_text(text)
            chunks = [Document(page_content=t) for t in texts]
        # Add metadata
        filename = os.path.basename(file_path)
        for doc in chunks:
            doc.metadata["source"] = filename
            doc.metadata["strategy"] = chunking_strategy
        return chunks
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []
def search_knowledge_base(query: str, username: str, index_name: str, embed_model_name: str, k: int = 5, final_k: int = 5):
    """
    Searches Pinecone with reranking.
    1. Fetches 3x final_k candidates (15 by default).
    2. Reranks them with the TinyBERT cross-encoder.
    3. Returns the top final_k.
    """
    if not PINECONE_KEY or not index_name: return []
    try:
        pm = PineconeManager(PINECONE_KEY)
        emb_fn = get_embedding_func(embed_model_name)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        # 1. RETRIEVE BROAD (Fetch 3x what we need)
        broad_k = final_k * 3
        initial_docs = vstore.similarity_search(query, k=broad_k)
        if not initial_docs or not reranker:
            return initial_docs[:final_k]
        # 2. RERANK (The Brain Upgrade)
        passages = [
            {"id": str(i), "text": doc.page_content, "meta": doc.metadata}
            for i, doc in enumerate(initial_docs)
        ]
        rerank_request = RerankRequest(query=query, passages=passages)
        ranked_results = reranker.rerank(rerank_request)
        # 3. SELECT TOP K
        final_docs = []
        for res in ranked_results[:final_k]:
            meta = res.get("meta", {})
            meta["rerank_score"] = res.get("score")  # Useful for debugging
            final_docs.append(Document(page_content=res["text"], metadata=meta))
        return final_docs
    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []
def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
    """Index a generated text blob as a single document, replacing any previous version."""
    if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
    try:
        pm = PineconeManager(PINECONE_KEY)
        # 1. PRE-EMPTIVE DELETE (remove stale vectors for this source)
        pm.delete_file(index_name, source_name, namespace=username)
        # 2. SAVE BACKUP
        user_docs_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_docs_dir, exist_ok=True)
        backup_path = os.path.join(user_docs_dir, source_name)
        with open(backup_path, "w", encoding='utf-8') as f:
            f.write(text)
        # 3. UPLOAD
        emb_fn = get_embedding_func()
        doc = Document(page_content=text, metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"})
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        vstore.add_documents([doc], ids=[f"{source_name}_0"])
        return True, f"Updated: {source_name}"
    except Exception as e:
        logger.error(f"Error indexing text: {e}")
        return False, str(e)
def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: Optional[str] = None, strategy: str = "paragraph") -> Tuple[bool, str]:
    """Chunk a file, learn acronyms from it, and upsert the chunks into Pinecone."""
    if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
    try:
        # 1. Chunking
        docs = process_file(file_path, chunking_strategy=strategy)
        if not docs: return False, "No valid chunks generated."
        # 2. Acronym Learning
        acronym_mgr = AcronymManager()
        for doc in docs:
            acronym_mgr.scan_text_for_acronyms(doc.page_content)
        # 3. Pinecone Manager
        pm = PineconeManager(PINECONE_KEY)
        # 4. SAFETY CHECK (embed a probe string to learn the model's vector dimension)
        emb_fn = get_embedding_func(embed_model_name)
        test_vec = emb_fn.embed_query("test")
        model_dim = len(test_vec)
        if not pm.check_dimension_compatibility(index_name, model_dim):
            return False, f"Dimension mismatch: index '{index_name}' is not compatible with the {model_dim}-d vectors produced by this embedding model."
        # 5. PRE-EMPTIVE DELETE (remove stale vectors for this file)
        filename = os.path.basename(file_path)
        pm.delete_file(index_name, filename, namespace=username)
        # 6. UPLOAD
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
        vstore.add_documents(docs, ids=custom_ids)
        return True, f"Successfully updated {filename} ({len(docs)} chunks)."
    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        return False, str(e)
def delete_document(username: str, filename: str, index_name: str):
    user_dir = os.path.join(UPLOAD_DIR, username)
    file_path = os.path.join(user_dir, filename)
    if os.path.exists(file_path): os.remove(file_path)
    if PINECONE_KEY and index_name:
        try:
            pm = PineconeManager(PINECONE_KEY)
            pm.delete_file(index_name, filename, namespace=username)
        except Exception as e:
            logger.error(f"Pinecone delete failed: {e}")
def list_documents(username: str) -> List[dict]:
    user_dir = os.path.join(UPLOAD_DIR, username)
    if not os.path.exists(user_dir): return []
    return [{"filename": f, "source": f} for f in os.listdir(user_dir) if f.lower().endswith(('.txt', '.md', '.pdf', '.docx'))]
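# The rebuild below assumes chunk text was stored in vector metadata ('text' or
# 'page_content') and that vector IDs end in "_<chunk index>", as written by
# ingest_file above.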
def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
    """Reconstruct the local document cache from the chunk text stored in Pinecone metadata."""
    if not PINECONE_KEY or not index_name: return False, "Pinecone config missing."
    try:
        pm = PineconeManager(PINECONE_KEY)
        ids = pm.get_all_ids(index_name, username)
        if not ids: return False, "No data found in Pinecone."
        batch_size = 100
        reconstructed_files = {}
        for i in range(0, len(ids), batch_size):
            batch_ids = ids[i : i + batch_size]
            response = pm.fetch_vectors(index_name, batch_ids, username)
            vectors = response.vectors
            for vec_id, vec_data in vectors.items():
                meta = vec_data.metadata or {}
                source = meta.get('source', 'unknown.txt')
                text = meta.get('text') or meta.get('page_content') or ''
                # IDs look like "<source>_<chunk index>"; default to 0 if unparsable
                try:
                    if "_" in vec_id: chunk_index = int(vec_id.rsplit('_', 1)[-1])
                    else: chunk_index = 0
                except ValueError: chunk_index = 0
                if source not in reconstructed_files: reconstructed_files[source] = []
                reconstructed_files[source].append((chunk_index, text))
        user_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_dir, exist_ok=True)
        count = 0
        for filename, chunks in reconstructed_files.items():
            chunks.sort(key=lambda x: x[0])  # Restore original chunk order
            full_text = "\n\n".join([c[1] for c in chunks])
            file_path = os.path.join(user_dir, filename)
            with open(file_path, "w", encoding="utf-8") as f: f.write(full_text)
            count += 1
        return True, f"Restored {count} files from Pinecone (chunks re-sorted into order)."
    except Exception as e:
        logger.error(f"Cache rebuild failed: {e}")
        return False, str(e)
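# Minimal usage sketch: ingest a file, then query it with reranked search.
# Assumes a local sample file and an existing Pinecone index named "demo-index";
# both are placeholders, adjust before running.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    ok, msg = ingest_file("source_documents/demo/sample.txt", username="demo",
                          index_name="demo-index")
    print(msg)
    if ok:
        results = search_knowledge_base("What does the sample cover?", username="demo",
                                        index_name="demo-index",
                                        embed_model_name="sentence-transformers/all-MiniLM-L6-v2")
        for doc in results:
            print(doc.metadata.get("source"), "-", doc.page_content[:80])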