| | from langchain.chains.summarize import load_summarize_chain |
| | from langchain.text_splitter import CharacterTextSplitter |
| | from langchain import OpenAI, LLMChain, PromptTemplate |
| |
|
| | from datasets import Dataset |
| | import textwrap |
| | from tqdm import tqdm |
| | import os |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
# Per-chunk prompt: ask the model for an exhaustive ("verbose") summary so no
# information is lost before the next recursion level.
template = (
    'Write a verbose summary of the following:\n\n\n'
    '"{text}"\n\n'
    'Do not omit any information. VERBOSE SUMMARY:\n\n\n'
)
prompt = PromptTemplate(template=template, input_variables=["text"])
# temperature=0 keeps the summaries deterministic across reruns.
chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt, verbose=False)
| |
|
def _summarize_func(chunk):
    """Map function for ``datasets.Dataset.map``: summarize one text chunk.

    Args:
        chunk: a dataset row — a dict with a ``"chunk"`` key holding the text.

    Returns:
        A dict with a single ``"summary"`` key (the model's summary string).

    Raises:
        TypeError: if the chain returns a non-string result.
    """
    text = chunk["chunk"]
    summary = chain.run(text)
    # Explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and a non-str here would corrupt the dataset column silently.
    if not isinstance(summary, str):
        raise TypeError(f"Expected str summary, got {type(summary).__name__}")
    return dict(summary=summary)
| |
|
| |
|
class RecursiveSummarizer:
    """Recursively summarize long text until it fits under a character budget.

    Each pass splits the text into token-bounded chunks, summarizes every
    chunk independently via the module-level ``chain`` (parallelized with
    ``datasets.Dataset.map``), and joins the summaries into a new, shorter
    text. Passes repeat until the text is below ``max_chars``, then the
    result is written to disk and returned.
    """

    def __init__(
        self,
        chunk_size=3900,
        chunk_overlap=0,
        max_chars=18000,
        output_dir="./data/bab",
    ):
        """Configure the splitter and recursion/output settings.

        Args:
            chunk_size: max tokens per chunk — keeps each chunk inside the
                model's context window (default matches the original 3900).
            chunk_overlap: tokens shared between adjacent chunks.
            max_chars: recursion stops once ``len(text)`` drops below this.
            output_dir: directory for the saved summary files.
        """
        # Token-based splitter so chunk boundaries respect the model context.
        self.splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator=" "
        )
        self.max_chars = max_chars
        self.output_dir = output_dir

    def _save_txt(self, string, lecture_number):
        """Write ``string`` to ``<output_dir>/lecture_<lecture_number>.txt``."""
        # Create the output directory if missing — the original assumed it
        # already existed and crashed otherwise.
        os.makedirs(self.output_dir, exist_ok=True)
        path = os.path.join(self.output_dir, f"lecture_{lecture_number}.txt")
        with open(path, "w", encoding="utf-8") as f:
            f.write(string)

    def summarize(self, text, n):
        """Shrink ``text`` below the budget by repeated chunk-summarization.

        Args:
            text: the full input text to condense.
            n: lecture identifier used to name the output file.

        Returns:
            The final (possibly unchanged) text; also saved to disk.
        """
        text_length = len(text)
        print("Initial Text length: ", text_length)

        i = 0
        while text_length > self.max_chars:
            i += 1
            print(f"Summarizing p{i}...")

            chunks = self.splitter.split_text(text)
            print(f"Number of chunks: {len(chunks)}")

            ds = Dataset.from_list([{"chunk": chunk} for chunk in chunks])
            # Cap parallelism at the CPU count: the original used
            # num_proc=len(chunks), which could fork hundreds of processes
            # for a long text.
            num_proc = max(1, min(len(chunks), os.cpu_count() or 1))
            summaries = ds.map(
                _summarize_func, num_proc=num_proc, remove_columns=["chunk"]
            )["summary"]

            # Joined summaries become the input for the next pass.
            text = " ".join(summaries)
            text_length = len(text)

        self._save_txt(text, lecture_number=n)
        return text
| | |
| |
|
if __name__ == "__main__":
    summarizer = RecursiveSummarizer()

    # Input transcripts: ./lecture_<n>.txt files in the current directory.
    txtfiles = [
        f for f in os.listdir(".")
        if f.startswith("lecture") and f.endswith(".txt")
    ]

    for t in tqdm(txtfiles):
        # "lecture_12.txt" -> "12"; reused to name the output file.
        n = t.split("_")[1].split(".")[0]
        print(f"Summarizing {t}...")

        # Explicit encoding: the original relied on the locale default,
        # which breaks on non-ASCII transcripts under some platforms.
        with open(t, "r", encoding="utf-8") as f:
            text = f.read()

        summarizer.summarize(text, n)