fvde committed on
Commit
9b2e531
·
1 Parent(s): 8e19587

Upload folder using huggingface_hub

Browse files
configuration/deployment.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "language_model_kwargs": {
3
- "model_name": "gpt-4",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
 
1
  {
2
  "language_model_kwargs": {
3
+ "model_name": "gpt-3.5-turbo-16k",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
configuration/example.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "language_model_kwargs": {
3
- "model_name": "gpt-3.5-turbo",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
 
1
  {
2
  "language_model_kwargs": {
3
+ "model_name": "gpt-3.5-turbo-16k",
4
  "temperature": 0.0
5
  },
6
  "summarization_kwargs": {
src/__pycache__/gradio_app.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/gradio_app.cpython-39.pyc and b/src/__pycache__/gradio_app.cpython-39.pyc differ
 
src/__pycache__/prompts.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/prompts.cpython-39.pyc and b/src/__pycache__/prompts.cpython-39.pyc differ
 
src/__pycache__/summarization.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/summarization.cpython-39.pyc and b/src/__pycache__/summarization.cpython-39.pyc differ
 
src/gradio_app.py CHANGED
@@ -4,7 +4,7 @@ import pypdfium2 as pdfium
4
  import gradio as gr
5
 
6
  from langchain.chat_models import ChatOpenAI
7
- from src.summarization import summarize_wrapper
8
  from src.mailing import send_email
9
 
10
  # Function to render a specific page of a PDF file as an image
@@ -79,6 +79,7 @@ def run_summarization_model_gradio(
79
  summary_short = gr.Button("Kurze Zusammenfassung", interactive=False)
80
  summary_middle = gr.Button("Mittlere Zusammenfassung", interactive=False)
81
  summary_long = gr.Button("Lange Zusammenfassung", interactive=False)
 
82
  with gr.Row().style(equal_height=True):
83
  with gr.Column(scale=1):
84
  summary_output = gr.Textbox(label="Zusammenfassung", lines=9).style(
@@ -114,7 +115,14 @@ def run_summarization_model_gradio(
114
  [gr.State(True)],
115
  [summary_short, summary_middle, summary_long],
116
  queue=False,
117
- ).then(fn=render_file, inputs=[file_upload], outputs=[show_pdf])
 
 
 
 
 
 
 
118
 
119
  # If you click any button first disable all buttons, then summarize and then enable the clicked button
120
  for s, summarization_type in [
@@ -149,6 +157,28 @@ def run_summarization_model_gradio(
149
  queue=False,
150
  )
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  # The clear button clears the dashboard
153
  clear.click(lambda: None, None, summary_output, queue=False).then(
154
  lambda: None, None, file_upload, queue=False
 
4
  import gradio as gr
5
 
6
  from langchain.chat_models import ChatOpenAI
7
+ from src.summarization import summarize_wrapper, parallel_summarization
8
  from src.mailing import send_email
9
 
10
  # Function to render a specific page of a PDF file as an image
 
79
  summary_short = gr.Button("Kurze Zusammenfassung", interactive=False)
80
  summary_middle = gr.Button("Mittlere Zusammenfassung", interactive=False)
81
  summary_long = gr.Button("Lange Zusammenfassung", interactive=False)
82
+ summary_parallel = gr.Button("Parallele Zusammenfassung", interactive=False)
83
  with gr.Row().style(equal_height=True):
84
  with gr.Column(scale=1):
85
  summary_output = gr.Textbox(label="Zusammenfassung", lines=9).style(
 
115
  [gr.State(True)],
116
  [summary_short, summary_middle, summary_long],
117
  queue=False,
118
+ ).then(
119
+ switch_buttons,
120
+ [gr.State(True)],
121
+ [summary_parallel, gr.State(None), gr.State(None)],
122
+ queue=False,
123
+ ).then(
124
+ fn=render_file, inputs=[file_upload], outputs=[show_pdf]
125
+ )
126
 
127
  # If you click any button first disable all buttons, then summarize and then enable the clicked button
128
  for s, summarization_type in [
 
157
  queue=False,
158
  )
159
 
160
+ summary_parallel.click(
161
+ switch_buttons,
162
+ [gr.State(False)],
163
+ [summary_short, summary_middle, summary_long],
164
+ queue=False,
165
+ ).then(
166
+ parallel_summarization,
167
+ [file_upload, gr.State([llm]), gr.State(summarization_kwargs)],
168
+ [summary_output],
169
+ queue=False,
170
+ ).then(
171
+ switch_buttons,
172
+ [gr.State(True)],
173
+ [summary_short, summary_middle, summary_long],
174
+ queue=False,
175
+ ).then(
176
+ switch_buttons,
177
+ [gr.State(True)],
178
+ [send_email_button, gr.State(None), gr.State(None)],
179
+ queue=False,
180
+ )
181
+
182
  # The clear button clears the dashboard
183
  clear.click(lambda: None, None, summary_output, queue=False).then(
184
  lambda: None, None, file_upload, queue=False
src/prompts.py CHANGED
@@ -150,3 +150,115 @@ Die Teile der Zusammenfassung mit Angabe der Seitenzahlen:
150
  ),
151
  },
152
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  ),
151
  },
152
  }
153
+
154
+
155
def get_template_mp(name: str, headline: str, additional_text: str = "") -> str:
    """Build a German summarization prompt for one section of a court ruling.

    The prompt instructs the model to write the given section (``name``) of a
    ruling in at most one paragraph, under the mandated ``headline``.  The
    literal ``{text}`` placeholder is left intact for LangChain's
    PromptTemplate to fill in later.

    Args:
        name (str): Section name inserted into the instruction sentence.
        headline (str): Headline the model must place above the section.
        additional_text (str, optional): Extra section-specific instructions.
            Defaults to "" (the placeholder line then becomes empty).

    Returns:
        str: The prompt text with all placeholders substituted.
    """
    template = (
        "Schreibe, ein/e <KEY> des Urteils, das durch dreifache Anführungszeichen begrenzt ist, in maximal einem Paragraphen.\n"
        "<ADDITIONAL_TEXT>\n"
        'Als Überschrift muss "<HEAD_LINE>" angegeben werden. \n'
        "Urteil:\n"
        "```{text}```\n"
        "\n"
        "\n\nText:\n"
    )
    substitutions = {
        "<KEY>": name,
        "<HEAD_LINE>": headline,
        "<ADDITIONAL_TEXT>": additional_text,
    }
    for placeholder, value in substitutions.items():
        template = template.replace(placeholder, value)
    return template
171
+
172
+
173
# Section prompts for the parallel summarization path: each key is one section
# of the final report; every section is summarized by its own independent LLM
# call (see generate_concurrently in src/summarization.py).
# Fix: the 'ausblick' fallback sentence previously misspelled "Ausblick" as
# "Auslbick", which the model was instructed to echo verbatim to the user.
prompts_parallel = {
    "intro": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(name="Einleitung", headline="I. Einleitung"),
    ),
    "darstellung_des_rechtsproblems": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Darstellung des Rechtsproblems",
            headline="Darstellung des Rechtsproblems",
        ),
    ),
    "angaben_ueber_das_urteil": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Angaben über das Urteil",
            headline="Angaben über das Urteil",
            additional_text="Gib die folgenden Informationen an: Gericht, Datum, Aktenzeichen (AZ: ...), Fundstelle(n)",
        ),
    ),
    "sachverhalt": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Sachverhalt",
            headline="Sachverhalt (unter Rückgriff auf Instanzentscheidung)",
            additional_text=(
                "Beziehe dich auf die Instanzentscheidung.\n"
                "Es soll nur der Sachverhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen Sachverhalt hat schreib: 'Keine Informationen zum Sachverhalt vorhanden'."
            ),
        ),
    ),
    "prozessgeschichte": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Prozessgeschichte", headline="3. Prozessgeschichte"
        ),
    ),
    "rechtsproblem": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Rechtsproblem",
            headline="Rechtsproblem",
            additional_text="Das Problem des Falles ist genau herauszuarbeiten und im rechtlichen Kontext zu verankern.",
        ),
    ),
    "loesung_des_gerichts": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Lösung des Gerichts", headline="Lösung des Gerichts"
        ),
    ),
    "loesungsansaetze_zum_problem": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Lösungsansätze zum Problem",
            headline="Lösungsansätze zum Problem",
            additional_text="Knappe, aber möglichst vollständige Übersicht der vertretenen Ansichten bzw. der Lösungsvorschläge im Urteil.",
        ),
    ),
    "analyse_und_einordnung_der_entscheidung": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Analyse und Einordnung der Entscheidung",
            headline="Analyse und Einordnung der Entscheidung",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden.",
        ),
    ),
    "bewertung_und_kritik_der_entscheidung": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Bewertung und Kritik der Entscheidung",
            headline="Bewertung und Kritik der Entscheidung",
            additional_text="Verwende ausschließlich den Kontext des Urteils und schreib keinen neuen Text. Wenn keine Bewertung oder Kritik vorhanden ist, antworte mit 'Keine Bewertung oder Kritik vorhanden.'",
        ),
    ),
    "eigener_loesungsvorschlag": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Eigener Lösungsvorschlag",
            headline="Eigener Lösungsvorschlag",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen eigenen Lösungsvorschlag hat schreib: 'Keine Informationen zum eigenen Lösungsvorschlag vorhanden'",
        ),
    ),
    "ausblick": PromptTemplate(
        input_variables=["text"],
        template=get_template_mp(
            name="Ausblick",
            headline="Ausblick",
            additional_text="Es soll nur der Inhalt des Urteils wiedergegeben werden. Wenn das Urteil keinen Ausblick gibt schreib: 'Keine Informationen zum Ausblick vorhanden'.",
        ),
    ),
}
src/summarization.py CHANGED
@@ -4,15 +4,18 @@ from langchain.chains.llm import LLMChain
4
  from langchain.chains.combine_documents.stuff import StuffDocumentsChain
5
  from langchain.chat_models import ChatOpenAI
6
  from langchain.docstore.document import Document
7
- from src.prompts import prompts
 
8
  from typing import Dict, List
 
9
 
10
 
11
- def load_docs(file_path: str) -> List[Document]:
12
  """Load a file and return the text.
13
 
14
  Args:
15
  file_path (str): Path to the pdf file. This can either be a local path or a tempfile.TemporaryFileWrapper_.
 
16
 
17
  Raises:
18
  ValueError: If the file type is not supported.
@@ -33,17 +36,15 @@ def load_docs(file_path: str) -> List[Document]:
33
  for doc in docs:
34
  doc.page_content = doc.page_content.replace("\n", " \n ")
35
  # if doc contains a page append it to the text
36
- if hasattr(doc, "metadata"):
37
- doc.page_content = (
38
- f"Start {doc.metadata.get('page')+1}"
39
- + doc.page_content
40
- + f" \n Ende Seite {doc.metadata.get('page')+1}"
41
  )
42
 
43
  return docs
44
 
45
 
46
- def summarize(
47
  file_path: str, llm: ChatOpenAI, summarization_kwargs: Dict[str, str]
48
  ) -> str:
49
  """Summarize a pdf file. The summarization is done by the language model.
@@ -109,6 +110,116 @@ def summarize_wrapper(
109
  else:
110
  raise ValueError(f"Summarization type {summarization_type} is not supported.")
111
 
112
- return summarize(
113
  file_path=file.name, llm=llm[0], summarization_kwargs=summarization_kwargs
114
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from langchain.chains.combine_documents.stuff import StuffDocumentsChain
5
  from langchain.chat_models import ChatOpenAI
6
  from langchain.docstore.document import Document
7
+ from src.prompts import prompts, prompts_parallel
8
+ import time
9
  from typing import Dict, List
10
+ import asyncio
11
 
12
 
13
+ def load_docs(file_path: str, with_pageinfo: bool = True) -> List[Document]:
14
  """Load a file and return the text.
15
 
16
  Args:
17
  file_path (str): Path to the pdf file. This can either be a local path or a tempfile.TemporaryFileWrapper_.
18
+ with_pageinfo (bool, optional): If True the page information is added to the document. Defaults to True.
19
 
20
  Raises:
21
  ValueError: If the file type is not supported.
 
36
  for doc in docs:
37
  doc.page_content = doc.page_content.replace("\n", " \n ")
38
  # if doc contains a page append it to the text
39
+ if with_pageinfo and hasattr(doc, "metadata"):
40
+ doc.page_content = f"(Quelle Seite: {doc.metadata.get('page')+1}) .".join(
41
+ doc.page_content.split(" .")
 
 
42
  )
43
 
44
  return docs
45
 
46
 
47
+ def summarize_chain(
48
  file_path: str, llm: ChatOpenAI, summarization_kwargs: Dict[str, str]
49
  ) -> str:
50
  """Summarize a pdf file. The summarization is done by the language model.
 
110
  else:
111
  raise ValueError(f"Summarization type {summarization_type} is not supported.")
112
 
113
+ return summarize_chain(
114
  file_path=file.name, llm=llm[0], summarization_kwargs=summarization_kwargs
115
  )
116
+
117
+
118
async def async_generate(
    llm: ChatOpenAI, docs: List[Document], summarization_kwargs: dict, k: str
) -> dict:
    """Summarize one report section asynchronously.

    Builds an LLMChain from ``summarization_kwargs`` (which carries the
    section-specific prompt) and awaits its result, logging wall-clock time.

    Args:
        llm (ChatOpenAI): Language model to use for the summarization.
        docs (List[Document]): Documents passed as the ``text`` input of the
            chain's prompt.
        summarization_kwargs (dict): Keyword arguments forwarded to LLMChain.
        k (str): Section key used to label the returned summary.

    Returns:
        dict: Single-entry mapping ``{k: summary_text}``.
    """
    print(f"Starting summarization for {k}")
    started = time.time()
    section_chain = LLMChain(llm=llm, **summarization_kwargs)
    summary = await section_chain.arun(text=docs)
    print(f"Time taken for {k}: ", time.time() - started)
    return {k: summary}
139
+
140
+
141
async def generate_concurrently(file_path: str, llm: ChatOpenAI) -> Dict[str, str]:
    """Summarize all report sections of a ruling concurrently.

    Loads the document once (without page markers) and launches one
    ``async_generate`` task per section prompt in ``prompts_parallel``.

    Fixes: removed the unused local ``i``; corrected the return annotation
    (the function returns a flat dict, not a list).

    Args:
        file_path (str): Path to the pdf file. This can either be a local path
            or a tempfile.TemporaryFileWrapper_.
        llm (ChatOpenAI): Language model to use for the summarization.

    Returns:
        Dict[str, str]: Mapping of section key to its summary text.
    """
    docs = load_docs(file_path=file_path, with_pageinfo=False)

    # Create one task per section prompt; they all run concurrently.
    tasks = []
    for k, pt in prompts_parallel.items():
        print(f"Appending task for {k}")
        tasks.append(
            async_generate(llm=llm, docs=docs, summarization_kwargs={"prompt": pt}, k=k)
        )
        print("-------------------")
    # Execute all coroutines concurrently and wait for every section.
    values = await asyncio.gather(*tasks)

    # Merge the single-entry result dicts into one flat mapping.
    values_flattened = {}
    for v in values:
        values_flattened.update(v)
    return values_flattened
172
+
173
+
174
def parallel_summarization(
    file: str, llm: List[ChatOpenAI], summarization_kwargs: dict
) -> str:
    """Wrapper for the parallel summarization function to make it compatible with gradio.

    Runs one LLM call per report section concurrently (via
    ``generate_concurrently``) and assembles the results into a single
    structured report string.

    Args:
        file (str): Path to the file. This can either be a local path or a
            tempfile.TemporaryFileWrapper_ (only ``file.name`` is read here).
        llm (List[ChatOpenAI]): Single-element list holding the language model;
            it is wrapped in a list to pass through gradio's State mechanism.
        summarization_kwargs (dict): Keyword arguments for the summarization.
            Note: not used by the parallel path; kept for signature parity
            with ``summarize_wrapper``.

    Returns:
        str: Summarization of the file, formatted as a multi-section report.
    """
    now = time.time()
    # Fan out the per-section LLM calls and block until all complete.
    values_flattened = asyncio.run(
        generate_concurrently(file_path=file.name, llm=llm[0])
    )
    print("Time taken: ", time.time() - now)

    # Assemble the fixed report layout from the section summaries.
    output = f"""

    {values_flattened["intro"]}

    {values_flattened["darstellung_des_rechtsproblems"]}


    II. Die Entscheidung

    {values_flattened["angaben_ueber_das_urteil"]}

    {values_flattened["sachverhalt"]}

    {values_flattened["prozessgeschichte"]}

    {values_flattened["rechtsproblem"]}

    {values_flattened["loesung_des_gerichts"]}

    III. Analyse

    {values_flattened["loesungsansaetze_zum_problem"]}

    {values_flattened["analyse_und_einordnung_der_entscheidung"]}

    {values_flattened["bewertung_und_kritik_der_entscheidung"]}

    {values_flattened["eigener_loesungsvorschlag"]}

    {values_flattened["ausblick"]}
    """

    return output