# Source: perplexity-clone/files/file_manager.py
# Author: Naveen-2007
# Last commit: Fix: LangChain 0.2.x compatibility - update imports and model config
# Commit: ee0f8f3
# files/file_manager.py
from typing import Dict, List
from pathlib import Path
import shutil
from document_processing.processor import DocumentProcessor
from vectorstore.store import VectorStore
from langchain_core.documents import Document
class FileWorkspace:
    """
    Holds vector store + metadata for one workspace's uploaded docs.

    A workspace owns a directory ``<base_dir>/<workspace_id>``, a
    ``DocumentProcessor`` used to load and split files, a ``VectorStore``
    that is created lazily on the first successful ``add_files`` call, and
    the list of filenames that have been accepted so far.
    """

    def __init__(self, workspace_id: str, base_dir: str = "workspace_data"):
        """
        Create the workspace directory and an empty, uninitialized index.

        Args:
            workspace_id: Identifier of the workspace; also the name of its
                sub-directory under ``base_dir``.
            base_dir: Root directory under which the workspace directory is
                created (parents are created as needed).
        """
        self.workspace_id = workspace_id
        self.base_dir = Path(base_dir) / workspace_id
        self.base_dir.mkdir(parents=True, exist_ok=True)
        self.processor = DocumentProcessor()
        self.vector = VectorStore()
        # Flipped to True once self.vector.create() has been called.
        self.initialized = False
        self.files: List[str] = []  # filenames

    def add_files(self, uploaded_paths: List[Path]):
        """
        Index newly uploaded files into the workspace vector store.

        Each path is loaded according to its suffix (``.pdf``, ``.txt``/``.md``,
        ``.ppt``/``.pptx``). Every document produced by a file is tagged with
        that file's own ``file_path`` and ``source`` metadata before the
        documents are split into chunks and stored.

        Loading is best-effort: a file whose loader raises is reported with
        ``print`` and skipped, and the remaining files are still processed.
        If no documents were loaded at all, the vector store is not touched.

        Args:
            uploaded_paths: Paths of the files to index.
        """
        docs: List[Document] = []
        for p in uploaded_paths:
            try:
                suffix = p.suffix.lower()
                loaded: List[Document] = []
                if suffix == ".pdf":
                    loaded = list(self.processor.load_pdf(str(p)))
                elif suffix in (".txt", ".md"):
                    loaded = list(self.processor.load_txt(str(p)))
                elif suffix in (".ppt", ".pptx"):
                    # Use UnstructuredPowerPointLoader if available
                    try:
                        from langchain_community.document_loaders import UnstructuredPowerPointLoader
                        loader = UnstructuredPowerPointLoader(str(p))
                        loaded = list(loader.load())
                    except ImportError:
                        # No fallback loader exists: report and skip the file
                        # without recording it.
                        print(f"UnstructuredPowerPointLoader not available for {p.name}")
                        continue
                # NOTE(review): a file with an unrecognized suffix yields no
                # documents but is still recorded in self.files below (kept
                # from the original behavior) - confirm this is intended.

                # Tag documents here, while `p` is still the file they came
                # from. Tagging after the loop would stamp every document with
                # whichever path happened to be iterated last.
                for doc in loaded:
                    doc.metadata["file_path"] = str(p)
                    doc.metadata["source"] = p.name
                docs.extend(loaded)
                self.files.append(p.name)
            except Exception as e:
                print(f"Error loading file {p.name}: {e}")
                continue

        if not docs:
            return

        chunks = self.processor.split(docs)
        if not self.initialized:
            # First batch: build the store from scratch.
            self.vector.create(chunks)
            self.initialized = True
        else:
            # Later batches: append to the existing store.
            self.vector.store.add_documents(chunks)

    def retrieve(self, query: str, k: int = 6):
        """
        Return the vector store's results for ``query``.

        Args:
            query: Search text passed through to the vector store.
            k: Number of results requested from the vector store.

        Returns:
            An empty list if nothing has been indexed yet, otherwise whatever
            ``self.vector.retrieve(query, k=k)`` returns.
        """
        if not self.initialized:
            return []
        return self.vector.retrieve(query, k=k)
class FileManager:
    """
    Registry of workspaces, keyed by workspace id.

    Workspaces are created on first request and cached in memory for the
    lifetime of the manager; each one lives under ``<base_dir>/<workspace_id>``.
    """

    def __init__(self, base_dir: str = "workspace_data"):
        """
        Args:
            base_dir: Root directory handed to every workspace created here.
        """
        self.base_dir = base_dir
        self._workspaces: Dict[str, FileWorkspace] = {}

    def get_workspace(self, workspace_id: str) -> FileWorkspace:
        """Return the cached workspace for ``workspace_id``, creating it if absent."""
        workspace = self._workspaces.get(workspace_id)
        if workspace is None:
            workspace = FileWorkspace(workspace_id, self.base_dir)
            self._workspaces[workspace_id] = workspace
        return workspace

    def clear_workspace(self, workspace_id: str):
        """Delete the workspace directory (if present) and drop the cached workspace."""
        workspace_dir = Path(self.base_dir).joinpath(workspace_id)
        if workspace_dir.exists():
            shutil.rmtree(workspace_dir)
        # Forget the in-memory workspace; a missing entry is not an error.
        self._workspaces.pop(workspace_id, None)

    def get_files(self, workspace_id: str) -> List[str]:
        """Return the filenames recorded for a cached workspace, or ``[]`` if it is unknown."""
        workspace = self._workspaces.get(workspace_id)
        return [] if workspace is None else workspace.files