Spaces:
Runtime error
Runtime error
| from ._utils import FewDocumentsError | |
| from ._utils import document_extraction, paragraph_extraction, semantic_search | |
| from corpora import gen_corpus | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| import string | |
def extract(query: str, search_model, n: int = 3, extracted_documents: list = None) -> str:
    """Extract the n most relevant paragraphs from the corpus for a query.

    Parameters:
        query (str): Sentence used to search the corpus for relevant documents.
        search_model: Similarity model handed to ``semantic_search`` for both
            the document-level and paragraph-level searches.
        n (int): Number of paragraphs to return.
        extracted_documents (list, optional): Pre-extracted documents. When
            provided (and non-empty), corpus generation and the keyword-based
            gross search are skipped entirely.

    Returns:
        str: The n most relevant paragraphs joined by line breaks.
    """
    # Gross search — only needed when no documents were supplied, so the
    # corpus generation and query tokenization are deferred into this branch
    # instead of running unconditionally.
    if not extracted_documents:
        # Open corpus
        corpus = gen_corpus(query)
        # Build the keyword list: lowercase the query, drop English stop
        # words, then drop pure-punctuation tokens.
        stop_words = set(stopwords.words('english'))
        query_tokens = word_tokenize(query.lower())
        tokens_without_sw = [word for word in query_tokens if word not in stop_words]
        keywords = [keyword for keyword in tokens_without_sw if keyword not in string.punctuation]
        # Only the documents themselves are used; the empty-document and
        # size statistics are deliberately discarded.
        extracted_documents, _documents_empty, _documents_sizes = document_extraction(
            dataset=corpus,
            query=query,
            keywords=keywords,
            min_document_size=0,
            min_just_one_paragraph_size=0
        )
    # First semantic search (over whole documents); distances are unused.
    selected_documents, _documents_distances = semantic_search(
        model=search_model,
        query=query,
        files=extracted_documents,
        number_of_similar_files=10
    )
    # Split the selected documents into candidate paragraphs.
    paragraphs = paragraph_extraction(
        documents=selected_documents,
        min_paragraph_size=20,
    )
    # Second semantic search (over paragraphs); distances are unused.
    selected_paragraphs, _paragraphs_distances = semantic_search(
        model=search_model,
        query=query,
        files=paragraphs,
        number_of_similar_files=10
    )
    return '\n'.join(selected_paragraphs[:n])