Spaces:

ruisp
/

MonPol

Runtime error

App Files Files

MonPol / filterminutes.py

ruisp

Some frontend cosmetics and refactoring.

42a39da almost 3 years ago

raw

history blame

3.12 kB

	import logging
	import gradio as gr
	import numpy as np

	# Module-level logger used by the filter helpers in this file; it is
	# registered under the name 'filter methods'.
	log = logging.getLogger('filter methods')
	# Configure the root logger at import time with level INFO, so the
	# log.info(...) progress messages emitted below are not filtered out.
	logging.basicConfig(level=logging.INFO)


	def filter_docs_by_meta(docs, filter_dict):
	    """
	    Keep only the documents whose metadata matches every filter entry.

	    Parameters:
	        docs : iterable of objects exposing a ``metadata`` mapping
	            (e.g. List[langchain.schema.Document]).
	        filter_dict : Dict[str, Any]
	            Maps a metadata key to the required value. Every value is
	            converted with ``int()`` before it is compared, so a numeric
	            string such as '2' matches an integer metadata value 2.

	    Returns: a new list with the matching documents, in their original
	        order. An empty ``filter_dict`` keeps every document. A document
	        that lacks one of the filtered keys is treated as a non-match.

	    Raises:
	        ValueError or TypeError if a filter value cannot be converted
	        by ``int()``.

	    Examples:
	        Given two documents whose metadata are {'a': 1, 'b': 2} and
	        {'a': 1, 'b': 3}:

	        filter_dict = {'a': 1}          -> both documents are kept
	        filter_dict = {'a': 1, 'b': 2}  -> only the first document is kept
	    """
	    # Convert the requested values once, instead of once per document.
	    wanted = {key: int(value) for key, value in filter_dict.items()}
	    return [
	        doc
	        for doc in docs
	        # A missing key means "no match" rather than a KeyError.
	        if all(
	            key in doc.metadata and doc.metadata[key] == target
	            for key, target in wanted.items()
	        )
	    ]


	def search_with_filter(vector_store, query, filter_dict, target_k=5, init_k=100, step=50, max_k=50000):
	    """
	    Expand a filtered similarity search until enough documents are found.

	    The store is queried for the top ``init_k`` documents and the result is
	    filtered with ``filter_docs_by_meta``. While fewer than ``target_k``
	    documents survive the filter, the search is repeated with ``k`` grown
	    by ``step`` each time, for as long as ``k`` stays below ``max_k``.
	    ----------
	    Parameters
	    vector_store : langchain.vectorstores.FAISS
	        The FAISS vector store; only ``similarity_search(query, k=...)``
	        is called on it.
	    query : str
	        The query to search for.
	    filter_dict : Dict[str, Any]
	        The parameters to filter for (see ``filter_docs_by_meta``).
	    target_k : int
	        The minimum number of documents desired after filtering.
	    init_k : int
	        The top-k documents to extract for the initial search.
	    step : int
	        The size of the step when enlarging the search.
	    max_k : int
	        Exclusive upper bound for ``k`` while expanding. The default of
	        50000 was chosen to exceed the number of actual documents.

	    Returns: List of filtered Documents for post-processing. It holds at
	        least ``target_k`` documents when the search succeeds; otherwise
	        it holds whatever the last (widest) search produced, which may be
	        fewer.

	    """
	    context = filter_docs_by_meta(vector_store.similarity_search(query, k=init_k), filter_dict)
	    len_docs_begin = len(context)
	    if len_docs_begin >= target_k:
	        log.info(f'Initial search contains {len_docs_begin} documents. Expansion not required. ')
	        return context
	    # Start one step beyond init_k: k=init_k was already searched above, so
	    # repeating it would only cost another identical query.
	    # NOTE(review): if the store returns fewer than k raw hits it is
	    # presumably exhausted and further expansion cannot help; an early exit
	    # on that condition would avoid many pointless searches - confirm
	    # against the vector store's behaviour before adding it.
	    for top_k_docs in np.arange(init_k + step, max_k, step):
	        log.info(f'Context contains {len(context)} documents')
	        log.info(f'Expanding search with k={top_k_docs}')
	        # np.arange yields NumPy scalars; pass a plain Python int as k.
	        context = filter_docs_by_meta(vector_store.similarity_search(query, k=int(top_k_docs)), filter_dict)
	        if len(context) >= target_k:
	            log.info(f'Success. Context contains {len(context)} documents matching the filtering criteria')
	            return context
	    log.info(f'Failed to reach target number of documents,'
	             f' context contains {len(context)} documents matching the filtering criteria')
	    return context