Spaces:

Akshayram1
/

question_ans

Runtime error

App Files Files Community

Akshayram1 committed on Apr 24, 2024

Commit

cf566b6

verified ·

1 Parent(s): 7d640e1

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -132

app.py CHANGED Viewed

@@ -1,13 +1,10 @@
 #!/usr/bin/env python3
-from langchain.chains import LLMChain
-from langchain.chains.summarize import load_summarize_chain
 from langchain.document_loaders import TextLoader, PyPDFLoader
-from langchain.llms import LlamaCpp
-from langchain.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import gradio as gr
 import time
 VERBOSE = True
 MAX_TOKENS = 2048
@@ -35,29 +32,10 @@ LANGUAGES = ["Default", "English", "Polish", "Portuguese",
              "Spanish", "Czech", "Turkish", "French", "German", ]
 # Model params
-MODEL_FILE = "./models/mistral-7b-openorca.Q5_K_M.gguf"
-MODEL_CONTEXT_WINDOW = 8192
-# Chunk params in characters (not tokens)
-CHUNK_SIZE = 10000
-CHUNK_OVERLAP = 500
-llm = LlamaCpp(
-    model_path=MODEL_FILE,
-    n_ctx=MODEL_CONTEXT_WINDOW,
-    # Don't be creative.
-    temperature=0,
-    max_tokens=MAX_TOKENS,
-    verbose=VERBOSE,
-    # Remove next two lines if NOT using macOS & M1 processor:
-    n_batch=512,
-    n_gpu_layers=1,
-)
 combine_prompt_template = """
-Write a summary of the following text delimited by tripple backquotes.
 {style}
 ```{content}```
@@ -65,94 +43,8 @@ Write a summary of the following text delimited by tripple backquotes.
 {trigger} {in_language}:
 """
-map_prompt_template = """
-Write a concise summary of the following text which covers the main points and key facts and figures:
-{text}
-CONCISE SUMMARY {in_language}:
-"""
-def summarize_base(llm, content, style, language):
-    """Summarize whole content at once. The content needs to fit into model's context window."""
-    prompt = PromptTemplate.from_template(
-        combine_prompt_template
-    ).partial(
-        style=STYLES[style]["style"],
-        trigger=STYLES[style]["trigger"],
-        in_language=f"in {language}" if language != "Default" else "",
-    )
-    chain = LLMChain(llm=llm, prompt=prompt, verbose=VERBOSE)
-    output = chain.run(content)
-    return output
-def summarize_map_reduce(llm, content, style, language):
-    """Summarize content potentially larger that model's context window using map-reduce approach."""
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP,
-    )
-    split_docs = text_splitter.create_documents([content])
-    print(
-        f"Map-Reduce content splits ({len(split_docs)} splits): {[len(sd.page_content) for sd in split_docs]}")
-    map_prompt = PromptTemplate.from_template(
-        map_prompt_template
-    ).partial(
-        in_language=f"in {language}" if language != "Default" else "",
-    )
-    combine_prompt = PromptTemplate.from_template(
-        combine_prompt_template
-    ).partial(
-        style=STYLES[style]["style"],
-        trigger=STYLES[style]["trigger"],
-        in_language=f"in {language}" if language != "Default" else "",
-    )
-    chain = load_summarize_chain(
-        llm=llm,
-        chain_type="map_reduce",
-        map_prompt=map_prompt,
-        combine_prompt=combine_prompt,
-        combine_document_variable_name="content",
-        verbose=VERBOSE,
-    )
-    output = chain.run(split_docs)
-    return output
-def load_input_file(input_file):
-    if not input_file:
-        return None
-    start_time = time.perf_counter()
-    if input_file.name.endswith(".pdf"):
-        loader = PyPDFLoader(input_file.name)
-        docs = loader.load()
-        end_time = time.perf_counter()
-        print(
-            f"PDF: loaded {len(docs)} pages, in {round(end_time - start_time, 1)} secs")
-        return "\n".join([d.page_content for d in docs])
-    docs = TextLoader(input_file.name).load()
-    end_time = time.perf_counter()
-    print(f"Input file load time {round(end_time - start_time, 1)} secs")
-    return docs[0].page_content
 def summarize_text(content, style, language, progress=gr.Progress()):
-    content_tokens = llm.get_num_tokens(content)
     print("Content length:", len(content))
     print("Content tokens:", content_tokens)
@@ -161,36 +53,20 @@ def summarize_text(content, style, language, progress=gr.Progress()):
     info = f"Content length: {len(content)} chars, {content_tokens} tokens."
     progress(None, desc=info)
-    # Keep part of context window for models output & some buffor for the promopt.
-    base_threshold = MODEL_CONTEXT_WINDOW - MAX_TOKENS - 256
     start_time = time.perf_counter()
-    if (content_tokens < base_threshold):
-        info += "\n"
-        info += "Using summarizer: base"
-        progress(None, desc=info)
-        print("Using summarizer: base")
-        summary = summarize_base(llm, content, style, language)
-    else:
-        info += "\n"
-        info += "Using summarizer: map-reduce"
-        progress(None, desc=info)
-        print("Using summarizer: map-reduce")
-        summary = summarize_map_reduce(llm, content, style, language)
     end_time = time.perf_counter()
     print("Summary length:", len(summary))
-    print("Summary tokens:", llm.get_num_tokens(summary))
     print("Summary:\n" + summary + "\n\n")
     info += "\n"
     info += f"Processing time: {round(end_time - start_time, 1)} secs."
     info += "\n"
-    info += f"Summary length: {llm.get_num_tokens(summary)} tokens."
     print("Info", info)
     return summary, info
@@ -266,4 +142,4 @@ with gr.Blocks() as ui:
     )
-ui.queue().launch(inbrowser=True)

 #!/usr/bin/env python3
 from langchain.document_loaders import TextLoader, PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 import gradio as gr
 import time
+from transformers import pipeline
 VERBOSE = True
 MAX_TOKENS = 2048
              "Spanish", "Czech", "Turkish", "French", "German", ]
 # Model params
+summarization_pipeline = pipeline("summarization", model="TheBloke/Mistral-7B-OpenOrca-GGUF")
 combine_prompt_template = """
+Write a summary of the following text delimited by triple backquotes.
 {style}
 ```{content}```
 {trigger} {in_language}:
 """
 def summarize_text(content, style, language, progress=gr.Progress()):
+    content_tokens = len(content.split())
     print("Content length:", len(content))
     print("Content tokens:", content_tokens)
     info = f"Content length: {len(content)} chars, {content_tokens} tokens."
     progress(None, desc=info)
     start_time = time.perf_counter()
+    summary = summarization_pipeline(content, max_length=2048, min_length=30, do_sample=False)[0]['summary_text']
     end_time = time.perf_counter()
     print("Summary length:", len(summary))
+    print("Summary tokens:", len(summary.split()))
     print("Summary:\n" + summary + "\n\n")
     info += "\n"
     info += f"Processing time: {round(end_time - start_time, 1)} secs."
     info += "\n"
+    info += f"Summary length: {len(summary.split())} tokens."
     print("Info", info)
     return summary, info
     )
+ui.queue().launch(inbrowser=True)