Spaces:
Paused
Paused
| from datetime import datetime, timezone | |
| from langchain.prompts import ChatPromptTemplate | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from src.utils.api_key_manager import APIKeyManager, with_api_manager | |
| from src.query_processing.late_chunking.late_chunker import LateChunker | |
class Reasoner:
    """LLM-backed helper that answers queries, summarises documents and extracts excerpts.

    Every public coroutine receives the chat model to call through the
    keyword-only ``llm`` argument. ``self.model`` is only used for token
    counting in :meth:`summarize`.

    The prompt texts are kept as complete literals (one per scenario) so each
    prompt can be read and reviewed as a whole.
    """

    # Answer prompt used when no context is supplied (placeholders: query, date).
    _ANSWER_TEMPLATE_NO_CONTEXT = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.
Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.
### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.
### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.
### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.
### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.
Query:
{query}
Current date & time in ISO format (UTC timezone): {date}"""

    # Answer prompt for "basic" queries whose context carries a "[USER PROVIDED" marker
    # (placeholders: context, query, date). It asks for no inline citations.
    _ANSWER_TEMPLATE_USER_CONTEXT = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.
Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.
### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.
### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- All user-provided files and/or links must be given higher priority to those sources when crafting the response.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.
### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.
### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.
Context:
{context}
Query:
{query}
Current date & time in ISO format (UTC timezone): {date}"""

    # Answer prompt for every other case with context; it requires [number] citations
    # (placeholders: context, query, date).
    _ANSWER_TEMPLATE_CITED = """You are an AI model skilled in web search and crafting detailed, engaging, and well-structured answers.
You excel at summarizing web pages and extracting relevant information to create professional, blog-style responses.
Your task is to provide answers that are:
- **Informative and relevant**: Thoroughly address the user's query using the given context.
- **Well-structured**: Include clear headings and subheadings, and use a professional tone to present information concisely and logically.
- **Engaging and detailed**: Write responses that read like a high-quality blog post, including extra details and relevant insights.
- **Cited and credible**: Use inline citations with [number] notation to refer to the context source(s) for each fact or detail included.
- **Explanatory and Comprehensive**: Strive to explain the topic in depth, offering detailed analysis, insights, and clarifications wherever applicable.
### Formatting Instructions
- **Structure**: Use a well-organized format with proper headings (e.g., "## Example heading 1" or "## Example heading 2").
Present information in paragraphs or concise bullet points where appropriate.
- **Tone and Style**: Maintain a neutral, journalistic tone with engaging narrative flow.
Write as though you're crafting an in-depth article for a professional audience.
- **Markdown Usage**: Format your response with Markdown for clarity. Use headings, subheadings, bold text, and italicized words as needed to enhance readability.
- **Length and Depth**: Provide comprehensive coverage of the topic. Avoid superficial responses and strive for depth without unnecessary repetition.
Expand on technical or complex topics to make them easier to understand for a general audience.
- **No main heading/title**: Start your response directly with the introduction unless asked to provide a specific title.
- **Conclusion or Summary**: Include a concluding paragraph that synthesizes the provided information or suggests potential next steps, where appropriate.
### [IMPORTANT] Citation Requirements
- Cite every single fact, statement, or sentence using [number] notation corresponding to the source from the provided `context`.
Each source in the `context` will be in the following format, where N is the source number:-
[SOURCE N START]
source content...
[SOURCE N END]
- Integrate citations naturally at the end of sentences or clauses as appropriate.
For example, "The Eiffel Tower is one of the most visited landmarks in the world[1]."
- [IMPORTANT] If applicable, use multiple sources for a single detail, such as, "Paris is a cultural hub, attracting millions of visitors annually[1][2]."
*DO NOT* use two numbers in the same citation marker, e.g., [1,2] is *NOT* valid.
- Always prioritize credibility and accuracy by linking all statements back to their respective context sources.
- Avoid citing unsupported assumptions or personal interpretations; if no source supports a statement, clearly indicate the limitation.
### Special Instructions
- If the query involves technical, historical, or complex topics, provide detailed background and explanatory sections to ensure clarity.
- If the user provides vague input or if relevant information is missing, explain what additional details might help refine the search.
- If the context contains any user-provided files and/or links, ensure to give higher priority to those sources when crafting the response.
- If no relevant information is found, say:
"Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?"
Be transparent about limitations and suggest alternatives or ways to reframe the query.
### User instructions
- These instructions are shared to you by the user as part of the query itself.
- You will have to follow them and give them higher priority than the above instructions.
- If the user has provided specific instructions or preferences, incorporate them into your response while adhering to the overall guidelines.
- If no instructions are provided, follow the general guidelines and instructions above.
### Example Output
- Begin with a brief introduction summarizing the event or query topic.
- Follow with detailed sections under clear headings, covering all aspects of the query if possible.
- Provide explanations or historical context as needed to enhance understanding.
- End with a conclusion or overall perspective if relevant.
Context:
{context}
Query:
{query}
Current date & time in ISO format (UTC timezone): {date}"""

    # Summarisation prompt (placeholders: query, content).
    _SUMMARY_TEMPLATE = """You are an expert at summarizing long documents.
Your task is to create a concise but detailed summary of documents that ultimately lead to detailed and precise answers to the queries.
Rules:
1. The summary should be concise but detailed, precise and accurate.
2. Focus on extracting key information, facts, and data that are directly relevant to the query.
3. Include specific details, numbers, and quotes when they are important.
4. Ensure that your summary preserves the original meaning and context of the information.
Your response should ONLY be the detailed summary of documents in plain text without any formatting.
Query:
{query}
Document:
{content}"""

    # Excerpt-extraction prompt (placeholders: source_docs, answer_text).
    # Doubled braces are literal braces escaped for the prompt template.
    _EXCERPTS_TEMPLATE = """You are an expert at generating excerpts from long documents.
Your task is to find and extract the most relevant, contiguous sentence(s) or short passage from the Source Documents that directly supports the Answer Text.
The Source Documents are formatted with markers like [SOURCE N START] and [SOURCE N END], where N is the source number.
The Answer Text uses citation markers like [N], where N directly corresponds to the source number N in the Source Documents.
In case of multiple citations, the Answer Text's citation markers will be like [N][M][...etc] (or in some cases, [N, M, ...etc]).
[IMPORTANT] Rules:
1. You must carefully read and analyse the Answer Text and the Source Documents.
2. The excerpts should be concise but detailed, precise and accurate.
3. Focus on extracting key information, facts, and data that are directly relevant to the answer.
4. Include specific details, numbers, and quotes when they are important.
5. Ensure the excerpts are verbatim and extracted directly from the context without any paraphrasing or alteration.
6. Your output should be a valid python list as shown in the output format below.
7. If you cannot find any relevant excerpts, say "Excerpt not found".
Output Format:
[
{{<statement 1>: {{<source number>: <extracted excerpt 1>,
<source number>: <extracted excerpt 2>,
and so on...}}
}},
{{<statement 2>: {{<source number>: <extracted excerpt 1>,
<source number>: <extracted excerpt 2>,
and so on...}}
}},
...and so on
]
Example Output:
[
{{"The Treaty of Waitangi is a foundational document in New Zealand's history.": {{
1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand."
}}
}},
{{"Signed in 1840, the principles of the Treaty are often debated.": {{
1: "The Treaty of Waitangi, signed in 1840, is considered the founding document of New Zealand.",
2: "The principles of the Treaty are often debated in legal and political contexts."
}}
}},
{{"The Treaty can arguably lead to a civil war in New Zealand.": {{
"NA": "Excerpt not found"
}}
}}
]
Source Documents:
{source_docs}
Answer Text:
{answer_text}"""

    def __init__(self):
        """Create the API key manager and obtain a model from it."""
        self.manager = APIKeyManager()
        self.model = self.manager.get_llm()

    @classmethod
    def _select_answer_template(cls, context, query_type):
        """Return the answer prompt template matching ``context`` and ``query_type``."""
        if context is None:
            return cls._ANSWER_TEMPLATE_NO_CONTEXT
        if query_type == "basic" and "[USER PROVIDED" in context:
            return cls._ANSWER_TEMPLATE_USER_CONTEXT
        return cls._ANSWER_TEMPLATE_CITED

    async def answer(
        self,
        query,
        context=None,
        query_type="general",
        *,
        llm
    ):
        """Stream an answer to ``query`` as an async generator of text pieces.

        Args:
            query: The user's query, inserted into the prompt.
            context: Optional context text. ``None`` selects the prompt without
                a context section. A context containing ``"[USER PROVIDED"``
                combined with ``query_type == "basic"`` selects the prompt
                without citation rules. Anything else selects the citation prompt.
            query_type: Query category; only the value ``"basic"`` changes
                which prompt is used.
            llm: Chat model exposing ``astream(messages)``.

        Yields:
            ``chunk.content`` for every chunk produced by ``llm.astream``.

        Raises:
            Any exception raised while formatting the prompt or streaming
            propagates unchanged.
        """
        template = self._select_answer_template(context, query_type)

        variables = {
            "query": query,
            "date": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        }
        # The no-context template has no {context} placeholder, so only pass
        # the variable when a context was actually supplied.
        if context is not None:
            variables["context"] = context

        messages = ChatPromptTemplate.from_template(template).format_messages(**variables)

        # No try/except here: the previous `except Exception as e: raise e`
        # re-raised the same exception and added nothing.
        async for chunk in llm.astream(messages):
            yield chunk.content

    async def summarize(
        self,
        query,
        content,
        model_name="minishlab/potion-base-8M",
        max_chunk_length=1000,
        max_tokens_allowed=None,
        overlap=200,
        *,
        llm
    ):
        """Summarise ``content`` with respect to ``query``.

        When ``max_tokens_allowed`` is truthy and ``content`` has more tokens
        than that (counted with ``self.model.get_num_tokens``), the content is
        first reduced with ``LateChunker.chunker``.

        Args:
            query: Query the summary should serve.
            content: Document text to summarise.
            model_name: Model name passed to ``LateChunker``.
            max_chunk_length: Forwarded to ``LateChunker.chunker``.
            max_tokens_allowed: Token budget; falsy disables the length check.
            overlap: Forwarded to ``LateChunker.chunker``.
            llm: Chat model exposing ``ainvoke(messages)``.

        Returns:
            The model's response content with surrounding whitespace stripped.
        """
        if max_tokens_allowed:
            content_tokens = self.model.get_num_tokens(content)
            if content_tokens > max_tokens_allowed:
                # Build the chunker only when it is needed, so no chunker is
                # created for content that already fits the budget.
                late_chunker = LateChunker(model_name=model_name)
                print("Content is too long, applying late chunking...")
                content = await late_chunker.chunker(
                    text=content,
                    query=query,
                    max_chunk_length=max_chunk_length,
                    max_tokens=max_tokens_allowed,
                    overlap=overlap
                )

        prompt = ChatPromptTemplate.from_template(self._SUMMARY_TEMPLATE)
        messages = prompt.format_messages(content=content, query=query)
        response = await llm.ainvoke(messages)
        return response.content.strip()

    async def get_excerpts(
        self,
        answer_text,
        source_docs,
        *,
        llm
    ):
        """Ask the model for source excerpts supporting each statement of an answer.

        Args:
            answer_text: Answer text containing citation markers.
            source_docs: Source documents text, inserted into the prompt.
            llm: Chat model exposing ``ainvoke(messages)``.

        Returns:
            The model's raw response content with surrounding whitespace
            stripped. The prompt requests a Python-list literal, but the text
            is not parsed or validated here.
        """
        prompt = ChatPromptTemplate.from_template(self._EXCERPTS_TEMPLATE)
        messages = prompt.format_messages(answer_text=answer_text, source_docs=source_docs)
        response = await llm.ainvoke(messages)
        return response.content.strip()
if __name__ == "__main__":
    # Manual smoke test: crawl a single page and print whatever the crawler returns.
    import asyncio

    from src.crawl.crawler import Crawler

    reasoner = Reasoner()
    page_crawler = Crawler()
    crawl_session = page_crawler.create_session()

    target_url = "https://www.parliament.nz/en/pb/sc/make-a-submission/document/54SCJUST_SCF_227E6D0B-E632-42EB-CFFE-08DCFEB826C6/principles-of-the-treaty-of-waitangi-bill"
    crawl_job = page_crawler.crawl_with_retry(
        target_url,
        session_id=crawl_session,
        rotate_proxy=False,
        return_html=True,
    )
    print(asyncio.run(crawl_job))