from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from src.prompts import (
prompts,
prompts_parallel_summary,
)
from src.doc_loading import load_docs
from src.llm_utils import async_generate_llmchain
import time
from typing import Dict, List
import asyncio
def summarize_chain(
    file_path: str, llm: ChatOpenAI, summarization_kwargs: Dict[str, str]
) -> str:
    """Summarize a PDF file with a single langchain summarization chain.

    Args:
        file_path (str): Path to the PDF file. This can either be a local path
            or a tempfile.TemporaryFileWrapper_.
        llm (ChatOpenAI): Language model used for the summarization.
        summarization_kwargs (Dict[str, str]): Extra keyword arguments passed
            through to ``load_summarize_chain`` (e.g. chain type and prompts).

    Returns:
        str: Summarization of the PDF file.
    """
    # Load/split the document, build the chain, and run it in one pass.
    loaded_docs = load_docs(file_path=file_path)
    summarization_chain = load_summarize_chain(llm=llm, **summarization_kwargs)
    return summarization_chain.run(loaded_docs)
def summarize_wrapper(
    file: str, llm: ChatOpenAI, summarization_type: str, summarization_kwargs: dict
) -> str:
    """Wrapper for the summarization function to make it compatible with gradio.

    This function uses a single summarization chain.

    Args:
        file (str): Path to the file. This can either be a local path or a
            tempfile.TemporaryFileWrapper_ (must expose ``.name``).
        llm (ChatOpenAI): Language model. NOTE(review): only ``llm[0]`` is
            used, so this is presumably a one-element sequence — confirm
            against the gradio caller.
        summarization_type (str): Type of summarization. Can be either
            "short", "middle" or "long".
        summarization_kwargs (dict): Keyword arguments for the summarization.
            The caller's dict is no longer mutated.

    Returns:
        str: Summarization of the file.

    Raises:
        ValueError: If ``summarization_type`` is not supported.
    """
    # Table lookup replaces three near-identical if/elif branches that
    # differed only in the prompt key.
    prompt_keys = {"short": "short_de", "middle": "middle_de", "long": "long_de"}
    prompt_key = prompt_keys.get(summarization_type)
    if prompt_key is None:
        raise ValueError(f"Summarization type {summarization_type} is not supported.")
    # Copy before updating so the caller's kwargs dict is not mutated
    # as a side effect (the original updated it in place).
    kwargs = dict(summarization_kwargs)
    kwargs.update(
        map_prompt=prompts[prompt_key]["map_prompt"],
        combine_prompt=prompts[prompt_key]["combine_prompt"],
    )
    return summarize_chain(
        file_path=file.name, llm=llm[0], summarization_kwargs=kwargs
    )
async def generate_summary_concurrently(
    file_path: str, sections: List[str], llm: ChatOpenAI
) -> Dict[str, str]:
    """Parallel summarization: run different prompts over the same docs concurrently.

    Args:
        file_path (str): Path to the pdf file. This can either be a local path
            or a tempfile.TemporaryFileWrapper_.
        sections (List[str]): List of sections to summarize selected by the user.
        llm (ChatOpenAI): Language model to use for the summarization.

    Returns:
        Dict[str, str]: Mapping from section key to its summarization.
            (The original annotation/docstring said ``List`` — the function
            has always returned a merged dict.)
    """
    docs = load_docs(file_path=file_path, with_pageinfo=False)
    # One task per requested section, in canonical order. A section may appear
    # in ``sections`` either as its user-facing label (resolved through the
    # inverse mapping) or, when no mapping exists, as the raw key itself.
    tasks = []
    for key in PARALLEL_SUMMARIZATION_ORDER:
        if PARALLEL_SUMMARIZATION_MAPPING_INVERSE.get(key, key) in sections:
            print(f"Appending task for summary: {key}")
            tasks.append(
                async_generate_llmchain(
                    llm=llm,
                    docs=docs,
                    llm_kwargs={"prompt": prompts_parallel_summary[key]},
                    k=key,
                )
            )
    print("-------------------")
    # Execute all section coroutines concurrently and merge their result dicts.
    results = await asyncio.gather(*tasks)
    merged: Dict[str, str] = {}
    for result in results:
        merged.update(result)
    return merged
# Canonical order in which section summaries are generated and concatenated.
# Entries are internal prompt keys of ``prompts_parallel_summary``.
# NOTE(review): "II. Die Entscheidung" has no entry in
# PARALLEL_SUMMARIZATION_MAPPING, so generate_summary_concurrently matches it
# against the user's selection verbatim — confirm this is intentional.
PARALLEL_SUMMARIZATION_ORDER = [
    "intro",
    "darstellung_des_rechtsproblems",
    "II. Die Entscheidung",
    "angaben_ueber_das_urteil",
    "sachverhalt",
    "prozessgeschichte",
    "rechtsproblem",
    "loesung_des_gerichts",
]
# Maps user-facing (German) section labels to internal prompt keys.
PARALLEL_SUMMARIZATION_MAPPING = {
    "I. Einleitung": "intro",
    "Darstellung des Rechtsproblems": "darstellung_des_rechtsproblems",
    "Angaben über das Urteil": "angaben_ueber_das_urteil",
    "Sachverhalt": "sachverhalt",
    "Prozessgeschichte": "prozessgeschichte",
    "Rechtsproblem": "rechtsproblem",
    "Lösung des Gerichts": "loesung_des_gerichts",
}
# Reverse lookup: internal prompt key -> user-facing label.
PARALLEL_SUMMARIZATION_MAPPING_INVERSE = {
    v: k for k, v in PARALLEL_SUMMARIZATION_MAPPING.items()
}
def parallel_summarization(file: str, sections: List[str], llm: ChatOpenAI) -> str:
    """Wrapper for the parallel summarization function to make it compatible with gradio.

    Args:
        file (str): Path to the file. This can either be a local path or a
            tempfile.TemporaryFileWrapper_ (must expose ``.name``).
        sections (List[str]): List of sections to summarize.
        llm (ChatOpenAI): Language model. NOTE(review): only ``llm[0]`` is
            used, so this is presumably a one-element sequence — confirm
            against the gradio caller.

    Returns:
        str: Summarization of the file, one section per "\\n\\n"-terminated
            paragraph.
    """
    start = time.time()
    values_flattened = asyncio.run(
        generate_summary_concurrently(
            file_path=file.name, sections=sections, llm=llm[0]
        )
    )
    print("Time taken for complete parallel summarization: ", time.time() - start)
    # The original looked each key up with a fallback default, but since it
    # iterated the dict's own keys the fallback was dead code — join the
    # values directly (same output, same order).
    return "".join(f"{summary}\n\n" for summary in values_flattened.values())