Spaces:
Runtime error
Runtime error
| import logging | |
| import gradio as gr | |
| import numpy as np | |
# Module-level logger shared by the filtering/search helpers in this file.
log = logging.getLogger('filter methods')
# Configure the root logger at import time so INFO-level messages are emitted
# (basicConfig is a no-op if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
def _coerce_filter_value(value):
    """Return ``int(value)`` when that conversion works, else ``value`` unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError):
        # Non-numeric filter values (e.g. 'en') are compared as they are.
        return value


def filter_docs_by_meta(docs, filter_dict):
    """
    Keep only the documents whose metadata matches every entry of a filter.

    Parameters:
        docs : List[langchain.schema.Document]
            Documents to filter; only their ``metadata`` mapping is read.
        filter_dict : Dict[str, Any]
            Metadata key -> required value. Values that can be converted with
            ``int()`` (e.g. the string '2') are compared as integers; any other
            value is compared unchanged.

    Returns:
        A new list with the documents that satisfy all filter entries, in
        their original order. A document that lacks one of the filter keys is
        treated as a non-match. An empty (or None) filter keeps every document.

    Examples:
        Given two documents with metadata {'a': 1, 'b': 2} and {'a': 1, 'b': 3}:
            filter_dict = {'a': 1}          -> both documents
            filter_dict = {'a': 1, 'b': 2}  -> only the first document
            filter_dict = {'b': '3'}        -> only the second document
    """
    if not filter_dict:
        return list(docs)

    # Coerce once up front instead of once per document.
    expected = {key: _coerce_filter_value(value) for key, value in filter_dict.items()}

    return [
        doc
        for doc in docs
        if all(
            key in doc.metadata and doc.metadata[key] == wanted
            for key, wanted in expected.items()
        )
    ]
def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50,
                       max_k=50000):
    """
    Expand a filtered similarity search until enough documents match.

    The store is queried with a growing top-k; after each query the results
    are filtered with ``filter_docs_by_meta``. The search stops as soon as at
    least ``target_k`` documents pass the filter, when ``max_k`` is reached,
    or when the store returns fewer documents than requested.

    Parameters
    ----------
    vector_store : langchain.vectorstores.FAISS
        The FAISS vector store; only ``similarity_search(query, k=...)`` is used.
    query : str
        The query to search for.
    filter_dict : Dict[str, Any]
        The metadata parameters to filter for.
    target_k : int
        The minimum number of documents desired after filtering.
    init_k : int
        The top-k documents to extract for the initial search.
    step : int
        The size of the step when enlarging the search (expected to be positive).
    max_k : int
        Exclusive upper bound for the expanded top-k. The default is meant to
        exceed the number of documents in the store.

    Returns
    -------
    List of filtered Documents for post-processing. It holds at least
    ``target_k`` documents on success, and possibly fewer when the target
    could not be reached.
    """
    raw_docs = vector_store.similarity_search(query, k=init_k)
    context = filter_docs_by_meta(raw_docs, filter_dict)
    if len(context) >= target_k:
        log.info('Initial search contains %d documents. Expansion not required. ', len(context))
        return context

    searched_k = init_k
    # Start one step beyond init_k: the init_k search has already been done above.
    for top_k_docs in range(init_k + step, max_k, step):
        if len(raw_docs) < searched_k:
            # Fewer hits than requested: presumably the store has no more
            # documents to offer, so a larger k cannot change the outcome.
            log.info('Vector store returned only %d documents for k=%d. Stopping expansion.',
                     len(raw_docs), searched_k)
            break
        log.info('Context contains %d documents', len(context))
        log.info('Expanding search with k=%d', top_k_docs)
        raw_docs = vector_store.similarity_search(query, k=top_k_docs)
        context = filter_docs_by_meta(raw_docs, filter_dict)
        searched_k = top_k_docs
        if len(context) >= target_k:
            log.info('Success. Context contains %d documents matching the filtering criteria',
                     len(context))
            return context

    log.info('Failed to reach target number of documents,'
             ' context contains %d documents matching the filtering criteria', len(context))
    return context