| from src.domain.block import Block |
| from src.domain.doc import Doc |
| from src.llm.llm_tools import summarize_paragraph_v2 |
| import gradio as gr |
|
|
class Retriever:
    """
    Summarizes the blocks of a document into a collection and searches it.

    When constructed with a document, each block of ``doc.blocks_requirements``
    is summarized with ``summarize_paragraph_v2`` and the summary is added to
    ``collection`` together with the block's metadata. Section-level summaries
    are then added by ``summarize_by_hierarchy``. When constructed without a
    document, the instance only wraps an existing collection for searching.

    Attributes:
        collection: The collection object that summaries and block metadata
            are added to and queried from. It must provide ``add`` and
            ``query`` (presumably a vector-store collection; confirm against
            the caller).
    """

    # Blocks whose content is longer than this many characters are split into
    # smaller blocks before being summarized.
    MAX_BLOCK_SIZE = 4500

    def __init__(self, doc: "Doc" = None, collection=None):
        """
        Stores the collection and, if a document is given, fills it.

        Args:
            doc: A document whose ``blocks_requirements`` are summarized and
                stored. When ``None``, nothing is added to the collection.
            collection: The collection object used to store and query
                summaries and metadata.
        """
        self.collection = collection
        if doc is None:
            return

        blocks = doc.blocks_requirements
        gr.Info("Please wait while the database is being created")

        for block in blocks:
            print(f"block index : {block.index}")

            # Oversized blocks are split first; every piece is summarized and
            # stored on its own, but still under the parent block's title.
            if len(block.content) > self.MAX_BLOCK_SIZE:
                pieces = block.separate_1_block_in_n(max_size=self.MAX_BLOCK_SIZE)
            else:
                pieces = [block]

            for piece in pieces:
                self._summarize_and_store(piece, doc.title, block.title)

        self.summarize_by_hierarchy(blocks, doc.title)
        gr.Info(f"The collection {collection.name} has been added to the database")

    def _summarize_and_store(self, block, doc_title, para_title):
        """Summarizes one block and adds the summary and metadata to the collection."""
        summary = summarize_paragraph_v2(
            prompt=block.content,
            title_doc=doc_title,
            title_para=para_title,
        )
        # Keep only the text following the opening tag when the summarizer
        # wraps its answer. NOTE(review): a closing tag, if one is emitted, is
        # not stripped here - confirm the summarizer's output format.
        if "<summary>" in summary:
            summary = summary.split("<summary>")[1]

        self.collection.add(
            documents=[summary],
            ids=[block.index],
            metadatas=[block.to_dict()],
        )

    def summarize_by_hierarchy(self, blocks, doc_title):
        """
        Adds one summary per hierarchical level that groups several blocks.

        A level is summarized when more than one block belongs to it and at
        least one of those blocks is a deepest block. The summary is stored
        under the id ``summary_<level>`` with the metadata of a block that
        combines the content of every block of the level and otherwise copies
        the fields of the level's first block.

        Args:
            blocks: A list of Block objects to be summarized.
            doc_title: The title of the document being processed.
        """
        hierarchy = self.create_hierarchy(blocks)
        deepest_indices = self.find_deepest_blocks(blocks)
        print("Hierarchy levels identified:", hierarchy.keys())
        # find_deepest_blocks returns index strings, not Block objects.
        print("Deepest block indices:", sorted(deepest_indices))

        for level, level_blocks in hierarchy.items():
            has_deepest = any(block.index in deepest_indices for block in level_blocks)
            if len(level_blocks) <= 1 or not has_deepest:
                print(f"Skipping level {level} as it is deepest blocks.")
                continue

            level_content = " ".join(block.content for block in level_blocks)

            print(f"Summarizing level {level} with content from blocks: {[block.index for block in level_blocks]}")
            level_summary = summarize_paragraph_v2(
                prompt=level_content,
                title_doc=doc_title,
                title_para=f"Summary of section : {level}",
            )

            first_block = level_blocks[0]
            combined_block = Block(
                doc=first_block.doc,
                title=first_block.title,
                content=level_content,
                index=first_block.index,
                rank=first_block.rank,
                level=first_block.level,
                distance=first_block.distance,
            )

            self.collection.add(
                documents=[level_summary],
                ids=[f"summary_{level}"],
                metadatas=[combined_block.to_dict()],
            )
            print(f"Added summary for level {level} to the collection.")

    def create_hierarchy(self, blocks):
        """
        Groups blocks under every hierarchical level their index belongs to.

        A block with index ``"1.2.3"`` is listed under ``"1"``, ``"1.2"`` and
        ``"1.2.3"``.

        Args:
            blocks: A list of Block objects to be organized into a hierarchy.

        Returns:
            A dictionary mapping each level string to the list of blocks that
            belong to it, in the order the blocks were given.
        """
        hierarchy = {}
        for block in blocks:
            for level in self.extract_levels(block.index):
                hierarchy.setdefault(level, []).append(block)
        return hierarchy

    def extract_levels(self, index):
        """
        Extracts all hierarchical levels from a block index.

        Args:
            index: The dot-separated index string of a block, e.g. ``"1.2.3"``.

        Returns:
            The list of cumulative prefixes of the index, shortest first,
            e.g. ``["1", "1.2", "1.2.3"]``.
        """
        parts = index.split('.')
        return ['.'.join(parts[:depth]) for depth in range(1, len(parts) + 1)]

    def find_deepest_blocks(self, blocks):
        """
        Identifies the deepest blocks in the hierarchy.

        A block is deepest when no other block's index extends its own index
        with a further ``.``-separated component.

        Args:
            blocks: A list of Block objects.

        Returns:
            A set of index strings of the deepest blocks.
        """
        block_indices = {block.index for block in blocks}
        # Every proper prefix of an index is an ancestor level, so an index
        # that appears among the ancestors has at least one descendant block.
        ancestor_levels = {
            level
            for index in block_indices
            for level in self.extract_levels(index)[:-1]
        }
        return block_indices - ancestor_levels

    def similarity_search(self, queries: "str | list[str]", n_results: int = 5) -> "list[Block]":
        """
        Performs a similarity search in the collection based on given queries.

        Only the matches of the first query are converted and returned.

        Args:
            queries: A string or list of strings representing the query or queries.
            n_results: The number of results requested from the collection.

        Returns:
            A list of Block objects built from the metadata of the matches,
            each with its ``distance`` set to the distance reported by the
            collection.
        """
        res = self.collection.query(query_texts=queries, n_results=n_results)
        # The result lists hold one entry per query; entry 0 is the first query.
        block_dict_sources = res['metadatas'][0]
        distances = res['distances'][0]

        blocks = []
        for block_dict, distance in zip(block_dict_sources, distances):
            block = Block().from_dict(block_dict)
            block.distance = distance
            blocks.append(block)
        return blocks
| |