final_project2

Sleeping

App Files Community

dnj0 committed on Nov 19, 2025

Commit

f84a554

1 Parent(s): 34bfedc

Simplify

Browse files

Files changed (2) hide show

src/config.py +10 -17
src/vector_store.py +161 -49

src/config.py CHANGED Viewed

@@ -2,37 +2,30 @@ import os
 from pathlib import Path
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
-OPENAI_MODEL = "gpt-4o-mini"
-USE_CACHE = True
 CHROMA_DB_PATH = "./chroma_db"
 DOCSTORE_PATH = "./docstore"
 PROCESSED_FILES_LOG = "./processed_files.txt"
 EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
 EMBEDDING_DIM = 768
-MAX_CHUNK_SIZE = 500
-CHUNK_OVERLAP = 50
-TEMPERATURE = 0.3
-MAX_TOKENS = 500
 LANGUAGE = "russian"
 Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
 Path(DOCSTORE_PATH).mkdir(exist_ok=True)
 UPLOAD_FOLDER = "./uploaded_pdfs"
 Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
-MAX_PDF_SIZE_MB = 50

 from pathlib import Path
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+OPENAI_MODEL = "gpt-4o-mini"  # Cheaper model variant
+USE_CACHE = True  # Enable response caching
 CHROMA_DB_PATH = "./chroma_db"
 DOCSTORE_PATH = "./docstore"
 PROCESSED_FILES_LOG = "./processed_files.txt"
 EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
 EMBEDDING_DIM = 768
+MAX_CHUNK_SIZE = 500  # Smaller chunks = fewer tokens
+CHUNK_OVERLAP = 50    # Less overlap = fewer chunks
+TEMPERATURE = 0.3     # Lower = faster, cheaper
+MAX_TOKENS = 500      # Limit response size (vs 1500)
 LANGUAGE = "russian"
 Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
 Path(DOCSTORE_PATH).mkdir(exist_ok=True)
 UPLOAD_FOLDER = "./uploaded_pdfs"
 Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
+MAX_PDF_SIZE_MB = 50
+BATCH_SEARCH_RESULTS = 3
+CACHE_RESPONSES = True
+SUMMARIZE_FIRST = True

src/vector_store.py CHANGED Viewed

@@ -1,88 +1,200 @@
 import os
 from typing import List, Dict
-from chromadb.config import Settings
 import chromadb
-from config import CHROMA_DB_PATH
 class VectorStore:
     def __init__(self):
-        self.chroma_path = CHROMA_DB_PATH
-        self.settings = Settings(
-            chroma_db_impl_embed_collection_mixin=True,
-            persist_directory=self.chroma_path,
-            anonymized_telemetry=False,
-            allow_reset=True,
-        )
-        self.client = chromadb.Client(self.settings)
-        self.collection = self.client.get_or_create_collection(
-            name="documents",
-            metadata={"hnsw:space": "cosine"}
-        )
-    def add_documents(self, documents: Dict, doc_id: str):
         try:
-            text = documents.get('text', '')
-            if not text or len(text.strip()) < 1:
-                print(f"Empty text for {doc_id}")
-                return
-            self.collection.add(
-                ids=[doc_id],
-                documents=[text],
-                metadatas=[{
-                    'doc_id': doc_id,
-                    'source': 'pdf_document'
-                }]
             )
-            print(f"Added document to vector store: {doc_id}")
         except Exception as e:
-            print(f"Error adding documents to vector store: {e}")
-            raise
     def search(self, query: str, n_results: int = 5) -> List[Dict]:
         try:
             results = self.collection.query(
-                query_texts=[query],
-                n_results=n_results,
-                include=['documents', 'metadatas', 'distances', 'embeddings']
             )
             formatted_results = []
-            if results and results['documents'] and len(results['documents']) > 0:
-                for idx, doc in enumerate(results['documents'][0]):
-                    distance = results['distances'][0][idx] if results['distances'] else 0
                     formatted_results.append({
                         'content': doc,
-                        'metadata': results['metadatas'][0][idx] if results['metadatas'] else {},
                         'distance': distance,
-                        'type': 'document'
                     })
             return formatted_results
         except Exception as e:
             print(f"Error searching vector store: {e}")
             return []
     def get_collection_info(self) -> Dict:
         try:
             count = self.collection.count()
             return {
                 'count': count,
-                'status': 'ready',
-                'persist_path': self.chroma_path
             }
         except Exception as e:
             print(f"Error getting collection info: {e}")
-            return {
-                'count': 0,
-                'status': 'error',
-                'persist_path': self.chroma_path
-            }
     def clear_all(self):
         try:
-            self.client.delete_collection(name="documents")
             self.collection = self.client.get_or_create_collection(
-                name="documents",
                 metadata={"hnsw:space": "cosine"}
             )
-            print("Vector store cleared")
         except Exception as e:
-            print(f"Error clearing vector store: {e}")

 import os
+import json
 from typing import List, Dict
 import chromadb
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
+class CLIPEmbedder:
+    def __init__(self, model_name: str = EMBEDDING_MODEL):
+        print(f"Loading embedding model: {model_name}")
+        self.model = SentenceTransformer(model_name)
+        print(f"Model loaded successfully")
+    def embed(self, text: str) -> List[float]:
+        try:
+            embedding = self.model.encode(text, convert_to_numpy=False)
+            return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
+        except Exception as e:
+            print(f"Error embedding text: {e}")
+            return [0.0] * EMBEDDING_DIM
+    def embed_batch(self, texts: List[str]) -> List[List[float]]:
+        try:
+            embeddings = self.model.encode(texts, convert_to_numpy=False)
+            return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
+        except Exception as e:
+            print(f"Error embedding batch: {e}")
+            return [[0.0] * EMBEDDING_DIM] * len(texts)
 class VectorStore:
     def __init__(self):
+        self.persist_directory = CHROMA_DB_PATH
+        self.embedder = CLIPEmbedder()
+        print(f"Initializing ChromaDB at: {self.persist_directory}")
         try:
+            self.client = chromadb.PersistentClient(
+                path=self.persist_directory
             )
+            print(f"ChromaDB initialized")
         except Exception as e:
+            print(f"Error initializing ChromaDB: {e}")
+            self.client = chromadb.PersistentClient(
+                path=self.persist_directory
+            )
+        try:
+            self.collection = self.client.get_or_create_collection(
+                name="multimodal_rag",
+                metadata={"hnsw:space": "cosine"}
+            )
+            count = self.collection.count()
+            print(f"Collection loaded: {count} items in store")
+        except Exception as e:
+            print(f"Error with collection: {e}")
+            self.collection = self.client.get_or_create_collection(
+                name="multimodal_rag"
+            )
+    def add_documents(self, documents: List[Dict], doc_id: str):
+        texts = []
+        metadatas = []
+        ids = []
+        print(f"Adding documents for: {doc_id}")
+        if 'text' in documents and documents['text']:
+            chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
+            for idx, chunk in enumerate(chunks):
+                texts.append(chunk)
+                metadatas.append({
+                    'doc_id': doc_id,
+                    'type': 'text',
+                    'chunk_idx': str(idx)
+                })
+                ids.append(f"{doc_id}_text_{idx}")
+            print(f"Text: {len(chunks)} chunks")
+        if 'images' in documents:
+            image_count = 0
+            for idx, image_data in enumerate(documents['images']):
+                if image_data.get('ocr_text'):
+                    texts.append(f"Image {idx}: {image_data['ocr_text']}")
+                    metadatas.append({
+                        'doc_id': doc_id,
+                        'type': 'image',
+                        'image_idx': str(idx),
+                        'image_path': image_data.get('path', '')
+                    })
+                    ids.append(f"{doc_id}_image_{idx}")
+                    image_count += 1
+            if image_count > 0:
+                print(f"Images: {image_count} with OCR text")
+        if 'tables' in documents:
+            table_count = 0
+            for idx, table_data in enumerate(documents['tables']):
+                if table_data.get('content'):
+                    texts.append(f"Table {idx}: {table_data.get('content', '')}")
+                    metadatas.append({
+                        'doc_id': doc_id,
+                        'type': 'table',
+                        'table_idx': str(idx)
+                    })
+                    ids.append(f"{doc_id}_table_{idx}")
+                    table_count += 1
+            if table_count > 0:
+                print(f"Tables: {table_count}")
+        if texts:
+            print(f"Generating {len(texts)} embeddings...")
+            embeddings = self.embedder.embed_batch(texts)
+            try:
+                self.collection.add(
+                    ids=ids,
+                    documents=texts,
+                    embeddings=embeddings,
+                    metadatas=metadatas
+                )
+                print(f"Successfully added {len(texts)} items to vector store")
+            except Exception as e:
+                print(f"Error adding to collection: {e}")
     def search(self, query: str, n_results: int = 5) -> List[Dict]:
         try:
+            query_embedding = self.embedder.embed(query)
             results = self.collection.query(
+                query_embeddings=[query_embedding],
+                n_results=n_results
             )
             formatted_results = []
+            if results['documents']:
+                for i, doc in enumerate(results['documents'][0]):
+                    metadata = results['metadatas'][0][i] if results['metadatas'] else {}
+                    distance = results['distances'][0][i] if results['distances'] else 0
                     formatted_results.append({
                         'content': doc,
+                        'metadata': metadata,
                         'distance': distance,
+                        'type': metadata.get('type', 'unknown')
                     })
             return formatted_results
         except Exception as e:
             print(f"Error searching vector store: {e}")
             return []
+    def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
+        chunks = []
+        start = 0
+        while start < len(text):
+            end = start + chunk_size
+            chunks.append(text[start:end])
+            start = end - overlap
+        return chunks
     def get_collection_info(self) -> Dict:
         try:
             count = self.collection.count()
             return {
+                'name': 'multimodal_rag',
                 'count': count,
+                'status': 'active',
+                'persist_path': self.persist_directory
             }
         except Exception as e:
             print(f"Error getting collection info: {e}")
+            return {'status': 'error', 'message': str(e)}
+    def delete_by_doc_id(self, doc_id: str):
+        try:
+            results = self.collection.get(where={'doc_id': doc_id})
+            if results['ids']:
+                self.collection.delete(ids=results['ids'])
+                print(f"Deleted {len(results['ids'])} documents for {doc_id}")
+                print(f"Changes persisted automatically")
+        except Exception as e:
+            print(f"Error deleting documents: {e}")
+    def persist(self):
+        print("Vector store is using auto-persist")
     def clear_all(self):
         try:
+            self.client.delete_collection(name="multimodal_rag")
             self.collection = self.client.get_or_create_collection(
+                name="multimodal_rag",
                 metadata={"hnsw:space": "cosine"}
             )
+            print("Collection cleared and reset")
         except Exception as e:
+            print(f"Error clearing collection: {e}")