fvde committed on
Commit
9b2e531
·
1 Parent(s): 8e19587

Upload folder using huggingface_hub

Browse files
configuration/deployment.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "language_model_kwargs": {
3
- "model_name": "gpt-4",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
 
1
  {
2
  "language_model_kwargs": {
3
+ "model_name": "gpt-3.5-turbo-16k",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
configuration/example.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "language_model_kwargs": {
3
- "model_name": "gpt-3.5-turbo",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
 
1
  {
2
  "language_model_kwargs": {
3
+ "model_name": "gpt-3.5-turbo-16k",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
src/__pycache__/gradio_app.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/gradio_app.cpython-39.pyc and b/src/__pycache__/gradio_app.cpython-39.pyc differ
 
src/__pycache__/prompts.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/prompts.cpython-39.pyc and b/src/__pycache__/prompts.cpython-39.pyc differ
 
src/__pycache__/summarization.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/summarization.cpython-39.pyc and b/src/__pycache__/summarization.cpython-39.pyc differ
 
src/gradio_app.py CHANGED
@@ -4,7 +4,7 @@ import pypdfium2 as pdfium
4
  import gradio as gr
5
 
6
  from langchain.chat_models import ChatOpenAI
7
- from src.summarization import summarize_wrapper
8
  from src.mailing import send_email
9
 
10
  # Function to render a specific page of a PDF file as an image
@@ -79,6 +79,7 @@ def run_summarization_model_gradio(
79
  summary_short = gr.Button("Kurze Zusammenfassung", interactive=False)
80
  summary_middle = gr.Button("Mittlere Zusammenfassung", interactive=False)
81
  summary_long = gr.Button("Lange Zusammenfassung", interactive=False)
 
82
  with gr.Row().style(equal_height=True):
83
  with gr.Column(scale=1):
84
  summary_output = gr.Textbox(label="Zusammenfassung", lines=9).style(
@@ -114,7 +115,14 @@ def run_summarization_model_gradio(
114
  [gr.State(True)],
115
  [summary_short, summary_middle, summary_long],
116
  queue=False,
117
- ).then(fn=render_file, inputs=[file_upload], outputs=[show_pdf])
 
 
 
 
 
 
 
118
 
119
  # If you click any button first disable all buttons, then summarize and then enable the clicked button
120
  for s, summarization_type in [
@@ -149,6 +157,28 @@ def run_summarization_model_gradio(
149
  queue=False,
150
  )
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  # The clear button clears the dashboard
153
  clear.click(lambda: None, None, summary_output, queue=False).then(
154
  lambda: None, None, file_upload, queue=False
 
4
  import gradio as gr
5
 
6
  from langchain.chat_models import ChatOpenAI
7
+ from src.summarization import summarize_wrapper, parallel_summarization
8
  from src.mailing import send_email
9
 
10
  # Function to render a specific page of a PDF file as an image
 
79
  summary_short = gr.Button("Kurze Zusammenfassung", interactive=False)
80
  summary_middle = gr.Button("Mittlere Zusammenfassung", interactive=False)
81
  summary_long = gr.Button("Lange Zusammenfassung", interactive=False)
82
+ summary_parallel = gr.Button("Parallele Zusammenfassung", interactive=False)
83
  with gr.Row().style(equal_height=True):
84
  with gr.Column(scale=1):
85
  summary_output = gr.Textbox(label="Zusammenfassung", lines=9).style(
 
115
  [gr.State(True)],
116
  [summary_short, summary_middle, summary_long],
117
  queue=False,
118
+ ).then(
119
+ switch_buttons,
120
+ [gr.State(True)],
121
+ [summary_parallel, gr.State(None), gr.State(None)],
122
+ queue=False,
123
+ ).then(
124
+ fn=render_file, inputs=[file_upload], outputs=[show_pdf]
125
+ )
126
 
127
  # If you click any button first disable all buttons, then summarize and then enable the clicked button
128
  for s, summarization_type in [
 
157
  queue=False,
158
  )
159
 
160
+ summary_parallel.click(
161
+ switch_buttons,
162
+ [gr.State(False)],
163
+ [summary_short, summary_middle, summary_long],
164
+ queue=False,
165
+ ).then(
166
+ parallel_summarization,
167
+ [file_upload, gr.State([llm]), gr.State(summarization_kwargs)],
168
+ [summary_output],
169
+ queue=False,
170
+ ).then(
171
+ switch_buttons,
172
+ [gr.State(True)],
173
+ [summary_short, summary_middle, summary_long],
174
+ queue=False,
175
+ ).then(
176
+ switch_buttons,
177
+ [gr.State(True)],
178
+ [send_email_button, gr.State(None), gr.State(None)],
179
+ queue=False,
180
+ )
181
+
182
  # The clear button clears the dashboard
183
  clear.click(lambda: None, None, summary_output, queue=False).then(
184
  lambda: None, None, file_upload, queue=False
src/prompts.py CHANGED
@@ -150,3 +150,115 @@ Die Teile der Zusammenfassung mit Angabe der Seitenzahlen:
150
  ),
151
  },
152
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  ),
151
  },
152
  }
153
+
154
+
155
def get_template_mp(name: str, headline: str, additional_text: str = "") -> str:
    """Build a German summarization prompt for one section of a court ruling.

    The prompt instructs the model to write the given section (``name``) of a
    ruling in at most one paragraph, under the mandated ``headline``.  The
    literal ``{text}`` placeholder is left intact for LangChain's
    PromptTemplate to fill in later.

    Args:
        name (str): Section name inserted into the instruction sentence.
        headline (str): Headline the model must place above the section.
        additional_text (str, optional): Extra section-specific instructions.
            Defaults to "" (the placeholder line then becomes empty).

    Returns:
        str: The prompt text with all placeholders substituted.
    """
    template = (
        "Schreibe, ein/e <KEY> des Urteils, das durch dreifache Anführungszeichen begrenzt ist, in maximal einem Paragraphen.\n"
        "<ADDITIONAL_TEXT>\n"
        'Als Überschrift muss "<HEAD_LINE>" angegeben werden. \n'
        "Urteil:\n"
        "```{text}```\n"
        "\n"
        "\n\nText:\n"
    )
    substitutions = {
        "<KEY>": name,
        "<HEAD_LINE>": headline,
        "<ADDITIONAL_TEXT>": additional_text,
    }
    for placeholder, value in substitutions.items():
        template = template.replace(placeholder, value)
    return template
171
+
172
+
173
# Section prompts for the parallel summarization path: each key is one section
# of the final report; every section is summarized by its own independent LLM
# call (see generate_concurrently in src/summarization.py).
# Fix: the 'ausblick' fallback sentence previously misspelled "Ausblick" as
# "Auslbick", which the model was instructed to echo verbatim to the user.
prompts_parallel = {
    "intro": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(name="Einleitung", headline="I. Einleitung"),
    ),
    "darstellung_des_rechtsproblems": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Darstellung des Rechtsproblems",
            headline="Darstellung des Rechtsproblems",
        ),
    ),
    "angaben_ueber_das_urteil": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Angaben über das Urteil",
            headline="Angaben über das Urteil",
            additional_text="Gib die folgenden Informationen an: Gericht, Datum, Aktenzeichen (AZ: ...), Fundstelle(n)",
        ),
    ),
    "sachverhalt": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Sachverhalt",
            headline="Sachverhalt (unter Rückgriff auf Instanzentscheidung)",
            additional_text=(
                "Beziehe dich auf die Instanzentscheidung.\n"
                "Es soll nur der Sachverhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen Sachverhalt hat schreib: 'Keine Informationen zum Sachverhalt vorhanden'."
            ),
        ),
    ),
    "prozessgeschichte": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Prozessgeschichte", headline="3. Prozessgeschichte"
        ),
    ),
    "rechtsproblem": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Rechtsproblem",
            headline="Rechtsproblem",
            additional_text="Das Problem des Falles ist genau herauszuarbeiten und im rechtlichen Kontext zu verankern.",
        ),
    ),
    "loesung_des_gerichts": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Lösung des Gerichts", headline="Lösung des Gerichts"
        ),
    ),
    "loesungsansaetze_zum_problem": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Lösungsansätze zum Problem",
            headline="Lösungsansätze zum Problem",
            additional_text="Knappe, aber möglichst vollständige Übersicht der vertretenen Ansichten bzw. der Lösungsvorschläge im Urteil.",
        ),
    ),
    "analyse_und_einordnung_der_entscheidung": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Analyse und Einordnung der Entscheidung",
            headline="Analyse und Einordnung der Entscheidung",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden.",
        ),
    ),
    "bewertung_und_kritik_der_entscheidung": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Bewertung und Kritik der Entscheidung",
            headline="Bewertung und Kritik der Entscheidung",
            additional_text="Verwende ausschließlich den Kontext des Urteils und schreib keinen neuen Text. Wenn keine Bewertung oder Kritik vorhanden ist, antworte mit 'Keine Bewertung oder Kritik vorhanden.'",
        ),
    ),
    "eigener_loesungsvorschlag": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Eigener Lösungsvorschlag",
            headline="Eigener Lösungsvorschlag",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen eigenen Lösungsvorschlag hat schreib: 'Keine Informationen zum eigenen Lösungsvorschlag vorhanden'",
        ),
    ),
    "ausblick": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Ausblick",
            headline="Ausblick",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen Ausblick gibt schreib: 'Keine Informationen zum Ausblick vorhanden'.",
        ),
    ),
}
src/summarization.py CHANGED
@@ -4,15 +4,18 @@ from langchain.chains.llm import LLMChain
4
  from langchain.chains.combine_documents.stuff import StuffDocumentsChain
5
  from langchain.chat_models import ChatOpenAI
6
  from langchain.docstore.document import Document
7
- from src.prompts import prompts
 
8
  from typing import Dict, List
 
9
 
10
 
11
- def load_docs(file_path: str) -> List[Document]:
12
  """Load a file and return the text.
13
 
14
  Args:
15
  file_path (str): Path to the pdf file. This can either be a local path or a tempfile.TemporaryFileWrapper_.
 
16
 
17
  Raises:
18
  ValueError: If the file type is not supported.
@@ -33,17 +36,15 @@ def load_docs(file_path: str) -> List[Document]:
33
  for doc in docs:
34
  doc.page_content = doc.page_content.replace("\n", " \n ")
35
  # if doc contains a page append it to the text
36
- if hasattr(doc, "metadata"):
37
- doc.page_content = (
38
- f"Start {doc.metadata.get('page')+1}"
39
- + doc.page_content
40
- + f" \n Ende Seite {doc.metadata.get('page')+1}"
41
  )
42
 
43
  return docs
44
 
45
 
46
- def summarize(
47
  file_path: str, llm: ChatOpenAI, summarization_kwargs: Dict[str, str]
48
  ) -> str:
49
  """Summarize a pdf file. The summarization is done by the language model.
@@ -109,6 +110,116 @@ def summarize_wrapper(
109
  else:
110
  raise ValueError(f"Summarization type {summarization_type} is not supported.")
111
 
112
- return summarize(
113
  file_path=file.name, llm=llm[0], summarization_kwargs=summarization_kwargs
114
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from langchain.chains.combine_documents.stuff import StuffDocumentsChain
5
  from langchain.chat_models import ChatOpenAI
6
  from langchain.docstore.document import Document
7
+ from src.prompts import prompts, prompts_parallel
8
+ import time
9
  from typing import Dict, List
10
+ import asyncio
11
 
12
 
13
+ def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
14
  """Load a file and return the text.
15
 
16
  Args:
17
  file_path (str): Path to the pdf file. This can either be a local path or a tempfile.TemporaryFileWrapper_.
18
+ with_pageinfo (bool, optional): If True the page information is added to the document. Defaults to True.
19
 
20
  Raises:
21
  ValueError: If the file type is not supported.
 
36
  for doc in docs:
37
  doc.page_content = doc.page_content.replace("\n", " \n ")
38
  # if doc contains a page append it to the text
39
+ if with_pageinfo and hasattr(doc, "metadata"):
40
+ doc.page_content = f"(Quelle Seite: {doc.metadata.get('page')+1}) .".join(
41
+ doc.page_content.split(" .")
 
 
42
  )
43
 
44
  return docs
45
 
46
 
47
+ def summarize_chain(
48
  file_path: str, llm: ChatOpenAI, summarization_kwargs: Dict[str, str]
49
  ) -> str:
50
  """Summarize a pdf file. The summarization is done by the language model.
 
110
  else:
111
  raise ValueError(f"Summarization type {summarization_type} is not supported.")
112
 
113
+ return summarize_chain(
114
  file_path=file.name, llm=llm[0], summarization_kwargs=summarization_kwargs
115
  )
116
+
117
+
118
async def async_generate(
    llm: ChatOpenAI, docs: List[Document], summarization_kwargs: dict, k: str
) -> dict:
    """Summarize one report section asynchronously.

    Builds an LLMChain from ``summarization_kwargs`` (which carries the
    section-specific prompt) and awaits its result, logging wall-clock time.

    Args:
        llm (ChatOpenAI): Language model to use for the summarization.
        docs (List[Document]): Documents passed as the ``text`` input of the
            chain's prompt.
        summarization_kwargs (dict): Keyword arguments forwarded to LLMChain.
        k (str): Section key used to label the returned summary.

    Returns:
        dict: Single-entry mapping ``{k: summary_text}``.
    """
    print(f"Starting summarization for {k}")
    started = time.time()
    section_chain = LLMChain(llm=llm, **summarization_kwargs)
    summary = await section_chain.arun(text=docs)
    print(f"Time taken for {k}: ", time.time() - started)
    return {k: summary}
139
+
140
+
141
async def generate_concurrently(file_path: str, llm: ChatOpenAI) -> Dict[str, str]:
    """Summarize all report sections of a ruling concurrently.

    Loads the document once (without page markers) and launches one
    ``async_generate`` task per section prompt in ``prompts_parallel``.

    Fixes: removed the unused local ``i``; corrected the return annotation
    (the function returns a flat dict, not a list).

    Args:
        file_path (str): Path to the pdf file. This can either be a local path
            or a tempfile.TemporaryFileWrapper_.
        llm (ChatOpenAI): Language model to use for the summarization.

    Returns:
        Dict[str, str]: Mapping of section key to its summary text.
    """
    docs = load_docs(file_path=file_path, with_pageinfo=False)

    # Create one task per section prompt; they all run concurrently.
    tasks = []
    for k, pt in prompts_parallel.items():
        print(f"Appending task for {k}")
        tasks.append(
            async_generate(llm=llm, docs=docs, summarization_kwargs={"prompt": pt}, k=k)
        )
        print("-------------------")
    # Execute all coroutines concurrently and wait for every section.
    values = await asyncio.gather(*tasks)

    # Merge the single-entry result dicts into one flat mapping.
    values_flattened = {}
    for v in values:
        values_flattened.update(v)
    return values_flattened
172
+
173
+
174
def parallel_summarization(
    file: str, llm: List[ChatOpenAI], summarization_kwargs: dict
) -> str:
    """Wrapper for the parallel summarization function to make it compatible with gradio.

    Runs one LLM call per report section concurrently (via
    ``generate_concurrently``) and assembles the results into a single
    structured report string.

    Args:
        file (str): Path to the file. This can either be a local path or a
            tempfile.TemporaryFileWrapper_ (only ``file.name`` is read here).
        llm (List[ChatOpenAI]): Single-element list holding the language model;
            it is wrapped in a list to pass through gradio's State mechanism.
        summarization_kwargs (dict): Keyword arguments for the summarization.
            Note: not used by the parallel path; kept for signature parity
            with ``summarize_wrapper``.

    Returns:
        str: Summarization of the file, formatted as a multi-section report.
    """
    now = time.time()
    # Fan out the per-section LLM calls and block until all complete.
    values_flattened = asyncio.run(
        generate_concurrently(file_path=file.name, llm=llm[0])
    )
    print("Time taken: ", time.time() - now)

    # Assemble the fixed report layout from the section summaries.
    output = f"""

    {values_flattened["intro"]}

    {values_flattened["darstellung_des_rechtsproblems"]}


    II. Die Entscheidung

    {values_flattened["angaben_ueber_das_urteil"]}

    {values_flattened["sachverhalt"]}

    {values_flattened["prozessgeschichte"]}

    {values_flattened["rechtsproblem"]}

    {values_flattened["loesung_des_gerichts"]}

    III. Analyse

    {values_flattened["loesungsansaetze_zum_problem"]}

    {values_flattened["analyse_und_einordnung_der_entscheidung"]}

    {values_flattened["bewertung_und_kritik_der_entscheidung"]}

    {values_flattened["eigener_loesungsvorschlag"]}

    {values_flattened["ausblick"]}
    """

    return output