Spaces:
Sleeping
Sleeping
ernani
committed on
Commit
·
fb72cf5
1
Parent(s):
5e9938c
Removed chromadb from tools
Browse files
tools.py
CHANGED
|
@@ -8,8 +8,6 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
| 8 |
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchRun
|
| 9 |
from langchain_community.document_loaders import PythonLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper
|
| 11 |
-
import chromadb
|
| 12 |
-
from chromadb.config import Settings
|
| 13 |
import pytube
|
| 14 |
from PIL import Image
|
| 15 |
import pandas as pd
|
|
@@ -709,76 +707,3 @@ class WebSearchTool(BaseTool):
|
|
| 709 |
|
| 710 |
except Exception as e:
|
| 711 |
return f"Error searching the web: {str(e)}"
|
| 712 |
-
|
| 713 |
-
class ChromaDBManager:
|
| 714 |
-
"""Manager for ChromaDB operations"""
|
| 715 |
-
def __init__(self, persist_directory: str = "./chroma_db"):
|
| 716 |
-
self.persist_directory = persist_directory
|
| 717 |
-
self.client = chromadb.Client(Settings(
|
| 718 |
-
persist_directory=persist_directory,
|
| 719 |
-
is_persistent=True
|
| 720 |
-
))
|
| 721 |
-
|
| 722 |
-
def create_collection(self, name: str):
|
| 723 |
-
"""Create a new collection or get existing one"""
|
| 724 |
-
try:
|
| 725 |
-
return self.client.create_collection(name=name)
|
| 726 |
-
except ValueError:
|
| 727 |
-
return self.client.get_collection(name=name)
|
| 728 |
-
|
| 729 |
-
def _generate_document_id(self, content: str, metadata: dict) -> str:
|
| 730 |
-
"""Generate a unique ID for a document based on its content and metadata"""
|
| 731 |
-
# Use content and key metadata fields for ID generation
|
| 732 |
-
id_parts = [content[:100]] # First 100 chars of content
|
| 733 |
-
if metadata:
|
| 734 |
-
source = metadata.get('source', '')
|
| 735 |
-
doc_type = metadata.get('type', '')
|
| 736 |
-
if source:
|
| 737 |
-
id_parts.append(str(source))
|
| 738 |
-
if doc_type:
|
| 739 |
-
id_parts.append(str(doc_type))
|
| 740 |
-
|
| 741 |
-
# Generate hash from combined parts
|
| 742 |
-
combined = "_".join(id_parts)
|
| 743 |
-
return f"doc_{hash(combined)}"
|
| 744 |
-
|
| 745 |
-
def add_documents_with_metadata(self, collection_name: str, documents: List[str], metadatas: List[dict]):
|
| 746 |
-
"""Add documents with their metadata to a collection"""
|
| 747 |
-
if not documents or not metadatas or len(documents) != len(metadatas):
|
| 748 |
-
raise ValueError("Invalid documents or metadata")
|
| 749 |
-
|
| 750 |
-
collection = self.create_collection(collection_name)
|
| 751 |
-
|
| 752 |
-
# Generate unique IDs for documents
|
| 753 |
-
ids = [self._generate_document_id(doc, meta)
|
| 754 |
-
for doc, meta in zip(documents, metadatas)]
|
| 755 |
-
|
| 756 |
-
try:
|
| 757 |
-
# First try to add documents
|
| 758 |
-
collection.add(
|
| 759 |
-
documents=documents,
|
| 760 |
-
metadatas=metadatas,
|
| 761 |
-
ids=ids
|
| 762 |
-
)
|
| 763 |
-
except Exception as e:
|
| 764 |
-
# If documents exist, update them
|
| 765 |
-
logging.info(f"Updating existing documents in collection {collection_name}")
|
| 766 |
-
collection.upsert(
|
| 767 |
-
documents=documents,
|
| 768 |
-
metadatas=metadatas,
|
| 769 |
-
ids=ids
|
| 770 |
-
)
|
| 771 |
-
|
| 772 |
-
def query_collection(self, collection_name: str, query: str, n_results: int = 5) -> Dict:
|
| 773 |
-
"""Query a collection with improved retrieval"""
|
| 774 |
-
try:
|
| 775 |
-
collection = self.client.get_collection(collection_name)
|
| 776 |
-
results = collection.query(
|
| 777 |
-
query_texts=[query],
|
| 778 |
-
n_results=n_results
|
| 779 |
-
)
|
| 780 |
-
|
| 781 |
-
return results
|
| 782 |
-
except Exception as e:
|
| 783 |
-
logging.error(f"Error querying collection {collection_name}: {str(e)}")
|
| 784 |
-
return {"documents": [], "metadatas": [], "distances": []}
|
|
|
|
| 8 |
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchRun
|
| 9 |
from langchain_community.document_loaders import PythonLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper
|
|
|
|
|
|
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
| 13 |
import pandas as pd
|
|
|
|
| 707 |
|
| 708 |
except Exception as e:
|
| 709 |
return f"Error searching the web: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|