Spaces:
Sleeping
Sleeping
File size: 11,388 Bytes
c88e290 38ce8e2 6695d4a 10e6f84 275f2bc 10f62d5 38ce8e2 b6c13b7 6695d4a 10f62d5 15383a5 ff3310f d469e88 15383a5 10f62d5 6695d4a 10e6f84 ff3310f 10e6f84 6695d4a 15383a5 10e6f84 38ce8e2 b6c13b7 275f2bc 10e6f84 b6c13b7 275f2bc 10f62d5 2b08f17 275f2bc 10a9f86 275f2bc b6c13b7 10a9f86 275f2bc b6c13b7 10a9f86 38ce8e2 10e6f84 10f62d5 15383a5 c88e290 10f62d5 15383a5 e0f2368 10f62d5 38ce8e2 10e6f84 6695d4a e5ea137 10f62d5 10e6f84 38ce8e2 15383a5 10f62d5 15383a5 10f62d5 15383a5 10e6f84 10f62d5 15383a5 10e6f84 10f62d5 15383a5 10f62d5 15383a5 10e6f84 15383a5 10f62d5 10e6f84 15383a5 10e6f84 38ce8e2 10f62d5 38ce8e2 73ee2f4 ff3310f 73ee2f4 de87550 9e30b0a 15383a5 9e30b0a 15383a5 9e30b0a 15383a5 ff3310f 15383a5 73ee2f4 ff3310f de87550 15383a5 10f62d5 73ee2f4 15383a5 10f62d5 15383a5 10f62d5 15383a5 10f62d5 ff3310f 15383a5 10f62d5 15383a5 10f62d5 de87550 ff3310f 10f62d5 38ce8e2 ff3310f 9e30b0a ff3310f 38ce8e2 10f62d5 15383a5 10f62d5 38ce8e2 10f62d5 38ce8e2 10e6f84 38ce8e2 ff3310f 10e6f84 f6e4ae6 10e6f84 f6e4ae6 633b400 f6e4ae6 10f62d5 15383a5 10f62d5 f6e4ae6 10e6f84 f6e4ae6 e4746b7 633b400 15383a5 e4746b7 633b400 10e6f84 633b400 f6e4ae6 10f62d5 633b400 f6e4ae6 10e6f84 f6e4ae6 15383a5 f6e4ae6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | import os
import shutil
import logging
from typing import List, Tuple, Optional
from huggingface_hub import snapshot_download
# LangChain Imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
# Internal Core Imports
from core.PineconeManager import PineconeManager
from core.AcronymManager import AcronymManager
from core.ChunkingManager import ChunkingManager
from flashrank import Ranker, RerankRequest
# CONFIGURATION
# Root folder holding one sub-directory of source documents per user.
UPLOAD_DIR = "source_documents"
# Pinecone API key; None when the environment variable is not set.
PINECONE_KEY = os.environ.get("PINECONE_API_KEY")
logger = logging.getLogger(__name__)

# Initialize Reranker
# The reranker is optional: it stays None when the model cannot be loaded,
# and the failure is only logged as a warning.
reranker = None
try:
    reranker = Ranker(
        model_name="ms-marco-TinyBERT-L-2-v2",
        cache_dir="/tmp/flashrank_cache",
    )
except Exception as e:
    logger.warning(f"Reranker failed to load: {e}")
def get_embedding_func(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Return a LangChain embeddings object for ``model_name``.

    Resolution order:
      1. Names containing "openai" or "text-embedding" -> ``OpenAIEmbeddings``
         (requires the ``OPENAI_API_KEY`` environment variable).
      2. Names containing "navy-custom-models" -> the model folder is
         downloaded with ``snapshot_download`` and loaded from disk. The name
         must look like ``<owner>/<repo>/<folder>``.
      3. Anything else -> ``HuggingFaceEmbeddings`` with the name as given.

    A missing/empty ``model_name`` selects the default MiniLM model.

    Any failure is logged and the default MiniLM model is returned instead.
    NOTE: that fallback may have a different vector dimension than the
    requested model, so callers writing to an index should verify dimensions.

    Args:
        model_name: Identifier of the embedding model (may be None/empty).

    Returns:
        An ``OpenAIEmbeddings`` or ``HuggingFaceEmbeddings`` instance.
    """
    default_model = "sentence-transformers/all-MiniLM-L6-v2"
    # None/"" used to reach the fallback only via an AttributeError on
    # `.lower()`; treat it as an explicit request for the default instead.
    model_name = model_name or default_model
    try:
        lowered = model_name.lower()
        # CHECK 1: OpenAI
        if "openai" in lowered or "text-embedding" in lowered:
            if not os.getenv("OPENAI_API_KEY"):
                raise ValueError("OpenAI API Key not found.")
            return OpenAIEmbeddings(model=model_name)
        # CHECK 2: YOUR CUSTOM FINE-TUNE
        if "navy-custom-models" in model_name:
            logger.info(f"Downloading custom model from: {model_name}")
            parts = model_name.split("/")
            if len(parts) < 3 or not all(parts[:3]):
                # Fail with a readable message instead of a bare IndexError.
                raise ValueError(
                    "Custom model name must look like '<owner>/<repo>/<folder>'."
                )
            repo_id = f"{parts[0]}/{parts[1]}"
            folder_name = parts[2]
            storage_path = snapshot_download(
                repo_id=repo_id,
                repo_type="model",
                allow_patterns=f"{folder_name}/*",
            )
            local_model_path = os.path.join(storage_path, folder_name)
            # FIX: Explicitly set device to CPU to avoid meta-tensor errors
            return HuggingFaceEmbeddings(
                model_name=local_model_path,
                model_kwargs={'device': 'cpu'},
            )
        # CHECK 3: Standard Public Models
        # FIX: Explicitly set device to CPU here as well
        return HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
        )
    except Exception as e:
        logger.error(f"Failed to load embedding model '{model_name}': {e}")
        return HuggingFaceEmbeddings(
            model_name=default_model,
            model_kwargs={'device': 'cpu'},
        )
def save_uploaded_file(uploaded_file, username: str) -> str:
    """Write an uploaded file into the user's folder under ``UPLOAD_DIR``.

    Args:
        uploaded_file: Object exposing ``.name`` (client-supplied file name)
            and ``.getbuffer()`` (the raw bytes to store).
        username: Name of the per-user sub-directory; created if missing.

    Returns:
        The path the file was written to.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written (including a name that reduces to nothing usable).
    """
    # The file name comes from the client: keep only its final component so a
    # name such as "../../etc/passwd" cannot escape the user's directory.
    safe_name = os.path.basename(uploaded_file.name)
    user_dir = os.path.join(UPLOAD_DIR, username)
    os.makedirs(user_dir, exist_ok=True)
    file_path = os.path.join(user_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
def process_file(file_path: str, chunking_strategy: str = "paragraph", embed_model_name: str = "all-mpnet-base-v2") -> List[Document]:
    """Chunk one document by delegating to ``ChunkingManager``.

    Args:
        file_path: Path of the document to chunk.
        chunking_strategy: Strategy name forwarded to ``process_document``.
        embed_model_name: Embedding model name handed to ``ChunkingManager``.

    Returns:
        The chunks produced by the manager. When the manager returns a dict,
        every list-valued entry is concatenated into one flat list (non-list
        values are skipped). Any exception is logged and yields ``[]``.
    """
    try:
        logger.info(f"Initializing ChunkingManager for {file_path} using {chunking_strategy}")
        chunker = ChunkingManager(embedding_model_name=embed_model_name)
        result = chunker.process_document(
            file_path=file_path,
            strategy=chunking_strategy,
            preprocess=True,
        )
        # Non-dict results are passed through untouched.
        if not isinstance(result, dict):
            return result
        # Dict results: merge the list-valued groups, in dict order.
        return [
            chunk
            for group in result.values()
            if isinstance(group, list)
            for chunk in group
        ]
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []
def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str = "sentence-transformers/all-MiniLM-L6-v2", strategy: str = "paragraph") -> Tuple[bool, str]:
    """Chunk a file, embed it and replace its vectors in a Pinecone index.

    Args:
        file_path: Path of the document to ingest.
        username: Used as the Pinecone namespace.
        index_name: Target Pinecone index.
        embed_model_name: Embedding model used for chunking and upload.
        strategy: Chunking strategy forwarded to ``process_file``.

    Returns:
        ``(True, message)`` on success, ``(False, reason)`` when configuration
        is missing, no chunks were produced, the model dimension is rejected
        by the index, or any step raises.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."
    try:
        # 1. Chunking
        docs = process_file(file_path, chunking_strategy=strategy, embed_model_name=embed_model_name)
        if not docs:
            return False, "No valid chunks generated."

        # 2. METADATA SANITIZATION (The Fix for Pinecone IDs)
        # We enforce that 'source' is just the filename, stripping the path.
        clean_filename = os.path.basename(file_path)
        for doc in docs:
            doc.metadata["source"] = clean_filename
            # Remove any absolute paths that might have leaked into metadata
            doc.metadata.pop("file_path", None)

        # 3. Acronym Learning
        acronym_mgr = AcronymManager()
        for doc in docs:
            acronym_mgr.scan_text_for_acronyms(doc.page_content)

        # 4. Pinecone Manager
        pm = PineconeManager(PINECONE_KEY)

        # 5. SAFETY CHECK
        # Embed a probe string to learn the model's output dimension, then
        # confirm the index accepts it before anything is deleted.
        emb_fn = get_embedding_func(embed_model_name)
        model_dim = len(emb_fn.embed_query("test"))
        if not pm.check_dimension_compatibility(index_name, model_dim):
            # model_dim is what the MODEL produces, not what the index
            # expects, so the message must say so.
            return False, (
                f"Dimension Mismatch! Model '{embed_model_name}' produces "
                f"{model_dim}d vectors, which index '{index_name}' does not accept."
            )

        # 6. PRE-EMPTIVE DELETE
        pm.delete_file(index_name, clean_filename, namespace=username)

        # 7. UPLOAD
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        # Now IDs will be "filename.txt_0", "filename.txt_1" etc.
        custom_ids = [f"{clean_filename}_{i}" for i in range(len(docs))]
        vstore.add_documents(docs, ids=custom_ids)
        return True, f"Successfully updated {clean_filename} ({len(docs)} chunks)."
    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        return False, str(e)
def process_and_add_text(text: str, source_name: str, username: str, index_name: str, embed_model_name: Optional[str] = None) -> Tuple[bool, str]:
    """Store generated text as a local file and (re)index it in Pinecone.

    The text is written to the user's folder under ``UPLOAD_DIR``, chunked
    with the "paragraph" strategy, tagged as generated content and uploaded
    with IDs of the form ``"<source>_<i>"``. Existing vectors for the same
    source are removed only after chunking and the dimension check have
    succeeded, so a failure in those steps does not lose indexed data.

    Args:
        text: Content to store and index.
        source_name: Name of the source; only its basename is used.
        username: Per-user folder name and Pinecone namespace.
        index_name: Target Pinecone index.
        embed_model_name: Embedding model name; passed through unchanged to
            ``ChunkingManager`` and ``get_embedding_func``.

    Returns:
        ``(True, message)`` on success, otherwise ``(False, reason)``.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."
    try:
        pm = PineconeManager(PINECONE_KEY)
        clean_source = os.path.basename(source_name)

        # 1. BACKUP (the chunker reads the text back from this file)
        user_docs_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_docs_dir, exist_ok=True)
        backup_path = os.path.join(user_docs_dir, clean_source)
        with open(backup_path, "w", encoding='utf-8') as f:
            f.write(text)

        # 2. CHUNK
        manager = ChunkingManager(embedding_model_name=embed_model_name)
        docs = manager.process_document(backup_path, strategy="paragraph", preprocess=True)
        if isinstance(docs, dict):
            # Same flattening as process_file: keep only list-valued groups.
            docs = [
                chunk
                for group in docs.values()
                if isinstance(group, list)
                for chunk in group
            ]
        if not docs:
            # Nothing to upload: leave the existing vectors untouched.
            return False, "No valid chunks generated."

        # 3. SANITIZE METADATA
        for doc in docs:
            doc.metadata["source"] = clean_source
            doc.metadata["file_type"] = "generated"
            doc.metadata["strategy"] = "flattened"

        # 4. SAFETY CHECK (same probe as ingest_file, done before deleting)
        emb_fn = get_embedding_func(embed_model_name)
        model_dim = len(emb_fn.embed_query("test"))
        if not pm.check_dimension_compatibility(index_name, model_dim):
            return False, (
                f"Dimension Mismatch! Model '{embed_model_name}' produces "
                f"{model_dim}d vectors, which index '{index_name}' does not accept."
            )

        # 5. DELETE OLD (only now that the replacement is ready)
        pm.delete_file(index_name, clean_source, namespace=username)

        # 6. UPLOAD
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        custom_ids = [f"{clean_source}_{i}" for i in range(len(docs))]
        vstore.add_documents(docs, ids=custom_ids)
        return True, f"Updated: {clean_source} ({len(docs)} chunks)"
    except Exception as e:
        logger.error(f"Error indexing text: {e}")
        return False, str(e)
def search_knowledge_base(query: str, username: str, index_name: str, embed_model_name: str, k: int = 5, final_k: int = 5):
    """Retrieve documents for ``query`` and rerank them when possible.

    A broad candidate set is fetched from the vector store, then the
    module-level ``reranker`` (if it loaded) reorders it and the best
    ``final_k`` are returned with a ``rerank_score`` metadata entry.

    Args:
        query: Search text.
        username: Pinecone namespace to search.
        index_name: Pinecone index to search.
        embed_model_name: Embedding model used to embed the query.
        k: Lower bound on the number of candidates fetched before reranking.
            The candidate pool is ``max(k, final_k * 3)``, so the defaults
            fetch the same 15 candidates as before.
        final_k: Maximum number of documents returned.

    Returns:
        A list of at most ``final_k`` documents; ``[]`` when configuration is
        missing or any step raises (the error is logged).
    """
    if not PINECONE_KEY or not index_name:
        return []
    try:
        pm = PineconeManager(PINECONE_KEY)
        emb_fn = get_embedding_func(embed_model_name)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)

        # `k` used to be ignored entirely; honour it as a floor on the pool.
        broad_k = max(k or 0, final_k * 3)
        initial_docs = vstore.similarity_search(query, k=broad_k)
        if not initial_docs or not reranker:
            # No reranker available: fall back to vector-store ordering.
            return initial_docs[:final_k]

        passages = [
            {"id": str(i), "text": doc.page_content, "meta": doc.metadata}
            for i, doc in enumerate(initial_docs)
        ]
        ranked_results = reranker.rerank(RerankRequest(query=query, passages=passages))

        final_docs = []
        for res in ranked_results[:final_k]:
            # Copy so the retrieved document's own metadata dict is not mutated.
            meta = dict(res.get("meta") or {})
            score = res.get("score")
            # float() normalises non-builtin numeric scalars to a plain float
            # (presumably the reranker may return NumPy floats - TODO confirm).
            meta["rerank_score"] = float(score) if score is not None else None
            final_docs.append(Document(page_content=res["text"], metadata=meta))
        return final_docs
    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []
def delete_document(username: str, filename: str, index_name: str):
    """Remove a document's local copy and its vectors from Pinecone.

    Args:
        username: Per-user folder name and Pinecone namespace.
        filename: Document name. Only its basename is used for the local
            path; the value is passed unchanged to the Pinecone delete.
        index_name: Pinecone index to clean; the remote delete is skipped
            when this or the API key is missing.

    Returns:
        None. Pinecone failures are logged, not raised.
    """
    # Keep only the final path component so a name like "../other/file.txt"
    # cannot delete anything outside the user's own directory.
    safe_name = os.path.basename(filename)
    file_path = os.path.join(UPLOAD_DIR, username, safe_name)
    # isfile (not exists): never hand a directory to os.remove.
    if safe_name and os.path.isfile(file_path):
        os.remove(file_path)

    if PINECONE_KEY and index_name:
        try:
            pm = PineconeManager(PINECONE_KEY)
            pm.delete_file(index_name, filename, namespace=username)
        except Exception as e:
            logger.error(f"Pinecone delete failed: {e}")
def list_documents(username: str) -> List[dict]:
    """List the documents stored locally for ``username``.

    Only names ending in .txt, .md, .pdf or .docx (case-insensitive) are
    reported, in the order ``os.listdir`` yields them.

    Returns:
        One ``{"filename": name, "source": name}`` dict per matching entry,
        or ``[]`` when the user's directory does not exist.
    """
    allowed_suffixes = ('.txt', '.md', '.pdf', '.docx')
    user_dir = os.path.join(UPLOAD_DIR, username)
    if not os.path.exists(user_dir):
        return []

    documents = []
    for entry in os.listdir(user_dir):
        if entry.lower().endswith(allowed_suffixes):
            documents.append({"filename": entry, "source": entry})
    return documents
def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
    """Recreate the user's local document folder from vectors in Pinecone.

    All vectors in the user's namespace are fetched in batches of 100 and
    grouped by the basename of their ``source`` metadata. Chunks are ordered
    by the integer after the last ``_`` in the vector ID (0 when absent or
    not numeric) and joined with blank lines into one text file per source.

    The existing local folder is wiped only after every batch has been
    fetched, so a failed fetch leaves the current cache intact.

    Args:
        username: Pinecone namespace and per-user folder name.
        index_name: Pinecone index to read from.

    Returns:
        ``(True, message)`` with the number of restored files, or
        ``(False, reason)`` when configuration is missing, the namespace has
        no IDs, or any step raises.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone config missing."
    try:
        pm = PineconeManager(PINECONE_KEY)
        ids = pm.get_all_ids(index_name, username)
        if not ids:
            return False, "No data found in Pinecone."

        # 1. FETCH everything first; nothing local is touched yet.
        batch_size = 100
        reconstructed_files = {}
        for start in range(0, len(ids), batch_size):
            batch_ids = ids[start:start + batch_size]
            response = pm.fetch_vectors(index_name, batch_ids, username)
            for vec_id, vec_data in response.vectors.items():
                meta = vec_data.metadata or {}
                # THE RESYNC FIX: Force basename to avoid "dir/dir/file" bugs
                # A missing, None or path-only source falls back to a fixed name.
                source = os.path.basename(meta.get('source') or '')
                if source in ('', '.', '..'):
                    source = 'unknown.txt'
                text = meta.get('text') or meta.get('page_content') or ''
                try:
                    chunk_index = int(vec_id.rsplit('_', 1)[-1]) if "_" in vec_id else 0
                except ValueError:
                    chunk_index = 0
                reconstructed_files.setdefault(source, []).append((chunk_index, text))

        # 2. WIPE the old cache now that the replacement data is in memory.
        user_dir = os.path.join(UPLOAD_DIR, username)
        if os.path.exists(user_dir):
            shutil.rmtree(user_dir)
        os.makedirs(user_dir, exist_ok=True)

        # 3. WRITE one file per source, chunks in index order.
        for filename, chunks in reconstructed_files.items():
            chunks.sort(key=lambda item: item[0])
            full_text = "\n\n".join(chunk_text for _, chunk_text in chunks)
            file_path = os.path.join(user_dir, filename)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(full_text)
        return True, f"Restored {len(reconstructed_files)} files from Pinecone!"
    except Exception as e:
        logger.error(f"Cache rebuild failed: {e}")
        return False, str(e)