Spaces:
Runtime error
Runtime error
| # Import necessary modules and classes | |
| from colpali_manager import ColpaliManager # Manages processing of images and text with the ColPali model | |
| from milvus_manager import MilvusManager # Manages interactions with the Milvus database | |
| from pdf_manager import PdfManager # Handles PDF processing tasks | |
| import hashlib # Library for creating hashed identifiers | |
# Module-level singletons shared by every Middleware instance.
# They are created once, at import time, in this order.

# Handles PDF-related operations (used by Middleware.index to save page images).
pdf_manager = PdfManager()

# Wraps the ColPali model (used to embed page images and query text).
colpali_manager = ColpaliManager()
class Middleware:
    """Glue layer between PDF rendering, ColPali embedding and Milvus storage.

    Each instance owns one ``MilvusManager`` whose database file name is
    derived from the identifier passed to the constructor. PDF conversion and
    embedding are delegated to the module-level ``pdf_manager`` and
    ``colpali_manager`` objects.
    """

    def __init__(self, id: str, create_collection=True):
        """Set up the Milvus manager for the given user/session identifier.

        Args:
            id (str): Unique identifier for the user/session. Only a short
                hash of it ends up in the database file name.
            create_collection (bool): Forwarded unchanged to ``MilvusManager``
                as its third positional argument.
        """
        # MD5 is used purely to derive a short, stable file-name suffix, not
        # for anything security related. Flagging that explicitly keeps the
        # call working on FIPS-restricted Python builds where plain md5() is
        # rejected; the resulting digest is unchanged.
        digest = hashlib.md5(id.encode("utf-8"), usedforsecurity=False).hexdigest()
        milvus_db_name = f"milvus_{digest[:8]}.db"

        # "colpali" is passed as the second positional argument
        # (presumably the collection name -- confirm against MilvusManager).
        self.milvus_manager = MilvusManager(milvus_db_name, "colpali", create_collection)

    def index(self, pdf_path: str, id: str, max_pages: int, pages: list[int] = None):
        """Convert a PDF to page images, embed them and insert them into Milvus.

        Args:
            pdf_path (str): Path to the PDF file.
            id (str): Identifier forwarded to ``pdf_manager.save_images``.
            max_pages (int): Page limit forwarded to ``pdf_manager.save_images``.
            pages (list[int], optional): Accepted for interface compatibility.
                NOTE(review): this value is currently NOT forwarded to
                ``pdf_manager.save_images``, so it has no effect.

        Returns:
            The image paths returned by ``pdf_manager.save_images``.

        Raises:
            IndexError: If ``colpali_manager.process_images`` yields fewer
                embeddings than there are image paths (assuming it returns a
                list-like sequence).
        """
        print(f"Indexing {pdf_path}, id: {id}, max_pages: {max_pages}")

        # Render the PDF pages to image files.
        image_paths = pdf_manager.save_images(id, pdf_path, max_pages)
        print(f"Saved {len(image_paths)} images")

        # One embedding per saved image, in the same order as image_paths.
        colbert_vecs = colpali_manager.process_images(image_paths)

        # Pair every image path with the embedding at the same position.
        # Indexing (rather than zip) is deliberate: a too-short embedding
        # list fails loudly instead of silently dropping pages.
        images_data = [
            {
                "colbert_vecs": colbert_vecs[position],
                "filepath": path,
            }
            for position, path in enumerate(image_paths)
        ]
        print(f"Inserting {len(images_data)} images data to Milvus")

        self.milvus_manager.insert_images_data(images_data)
        print("Indexing completed")

        return image_paths

    def search(self, search_queries: list[str]):
        """Run each text query against the indexed data.

        Args:
            search_queries (list[str]): Queries to run, one search per entry.

        Returns:
            list: One entry per query, in input order; each entry is whatever
            ``self.milvus_manager.search`` returns for that query with
            ``topk=1``.
        """
        print(f"Searching for {len(search_queries)} queries")

        final_res = []
        for query in search_queries:
            print(f"Searching for query: {query}")

            # process_text takes a batch; send a one-element batch and keep
            # its single result.
            query_vec = colpali_manager.process_text([query])[0]

            search_res = self.milvus_manager.search(query_vec, topk=1)
            print(f"Search result: {search_res} for query: {query}")

            final_res.append(search_res)

        return final_res