File size: 3,471 Bytes
b02630d
 
 
 
 
 
 
 
ee0f8f3
b02630d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# files/file_manager.py

from typing import Dict, List
from pathlib import Path
import shutil

from document_processing.processor import DocumentProcessor
from vectorstore.store import VectorStore
from langchain_core.documents import Document


class FileWorkspace:
    """
    Holds vector store + metadata for one workspace's uploaded docs.

    Each workspace owns a directory ``<base_dir>/<workspace_id>``, a
    ``DocumentProcessor`` used to load and split files, and a ``VectorStore``
    that is created lazily on the first successful ``add_files`` call.
    """
    def __init__(self, workspace_id: str, base_dir: str = "workspace_data"):
        """
        Create the workspace directory (if missing) and empty index state.

        Args:
            workspace_id: Name of the workspace; used as the sub-directory name.
            base_dir: Parent directory under which the workspace directory lives.
        """
        self.workspace_id = workspace_id
        self.base_dir = Path(base_dir) / workspace_id
        self.base_dir.mkdir(parents=True, exist_ok=True)

        self.processor = DocumentProcessor()
        self.vector = VectorStore()
        # False until the first batch of chunks has been passed to vector.create().
        self.initialized = False
        self.files: List[str] = []  # filenames

    def _load_documents(self, path: Path):
        """
        Load one file with the loader that matches its (case-insensitive) suffix.

        Returns:
            The documents produced by the loader, an empty list for a suffix
            this class does not handle, or ``None`` when the PowerPoint loader
            raises ImportError (the caller then skips the file entirely).
        """
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            return self.processor.load_pdf(str(path))
        if suffix in (".txt", ".md"):
            return self.processor.load_txt(str(path))
        if suffix in (".ppt", ".pptx"):
            # Use UnstructuredPowerPointLoader if available; it is imported
            # lazily so the module still works without that optional package.
            try:
                from langchain_community.document_loaders import UnstructuredPowerPointLoader
                loader = UnstructuredPowerPointLoader(str(path))
                return loader.load()
            except ImportError:
                print(f"UnstructuredPowerPointLoader not available for {path.name}")
                return None
        # Unrecognised suffix: nothing to index.
        return []

    def add_files(self, uploaded_paths: List[Path]):
        """
        Index newly uploaded files into the workspace vector store.

        Every loaded document is tagged with the path and name of the file it
        came from (``metadata["file_path"]`` / ``metadata["source"]``) before
        splitting. A file whose loading raises is reported with ``print`` and
        skipped; the remaining files are still processed.

        Args:
            uploaded_paths: Paths of the files to load and index.
        """
        docs: List[Document] = []

        for p in uploaded_paths:
            try:
                loaded = self._load_documents(p)
                if loaded is None:
                    # PowerPoint loader unavailable: file is neither indexed
                    # nor recorded in self.files.
                    continue
                loaded = list(loaded)
                # Tag per file, inside the loop. Tagging after the loop would
                # stamp every document with whichever path came last.
                for doc in loaded:
                    doc.metadata["file_path"] = str(p)
                    doc.metadata["source"] = p.name
            except Exception as e:
                print(f"Error loading file {p.name}: {e}")
                continue

            docs.extend(loaded)
            # NOTE: files with an unrecognised suffix are still recorded here
            # even though they contribute no documents (existing behaviour).
            self.files.append(p.name)

        if not docs:
            return

        chunks = self.processor.split(docs)

        if not self.initialized:
            # First batch creates the store; later batches are appended to it.
            self.vector.create(chunks)
            self.initialized = True
        else:
            self.vector.store.add_documents(chunks)

    def retrieve(self, query: str, k: int = 6):
        """
        Return the result of ``self.vector.retrieve(query, k=k)``.

        Returns an empty list when nothing has been indexed yet.
        """
        if not self.initialized:
            return []
        return self.vector.retrieve(query, k=k)


class FileManager:
    """
    Registry of workspaces, keyed by workspace id.

    Workspaces are built on first request and cached in memory; their
    directories live under ``base_dir``.
    """
    def __init__(self, base_dir: str = "workspace_data"):
        """Remember the parent directory and start with an empty registry."""
        self.base_dir = base_dir
        self._workspaces: Dict[str, FileWorkspace] = {}

    def get_workspace(self, workspace_id: str) -> FileWorkspace:
        """Return the cached workspace for ``workspace_id``, creating it if absent."""
        registry = self._workspaces
        if workspace_id in registry:
            return registry[workspace_id]

        workspace = FileWorkspace(workspace_id, self.base_dir)
        registry[workspace_id] = workspace
        return workspace

    def clear_workspace(self, workspace_id: str):
        """Delete the workspace's directory tree (if any) and drop it from the registry."""
        target = Path(self.base_dir).joinpath(workspace_id)
        if target.exists():
            shutil.rmtree(target)
        # pop with a default is a no-op when the id was never registered.
        self._workspaces.pop(workspace_id, None)

    def get_files(self, workspace_id: str) -> List[str]:
        """Return the registered workspace's filename list, or ``[]`` if it is unknown."""
        if workspace_id not in self._workspaces:
            return []
        return self._workspaces[workspace_id].files