achref Trabelsi commited on
Commit
8bda2ce
·
1 Parent(s): 36e1e41
Files changed (6) hide show
  1. README.md +38 -12
  2. app.py +83 -0
  3. core.py +57 -0
  4. file_operations.py +22 -0
  5. key_openai.txt +1 -0
  6. requirements.txt +7 -0
README.md CHANGED
@@ -1,12 +1,38 @@
1
- ---
2
- title: Neuro Internal Tools
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 3.36.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Paper Reader Tool
2
+
3
+ Scientific papers are coming out TOO DAMN FAST so we need a way to very quickly extract useful information.
4
+
5
+ ## Repo Contents
6
+
7
+ - `chat.py` - this file is a simple chatbot that will chat with you about the contents of `input.txt` (you can copy/paste anything into this text file). Very useful to quickly discuss papers.
8
+ - `generate_multiple_reports.py` - this will consume all PDFs in the `input/` folder and generate summaries in the `output/` folder. This is helpful for bulk processing such as for literature reviews.
9
+ - `render_report.py` - this will render all the reports in `output/` to an *easier* to read file in `report.html`.
10
+ - `app.py` - this will make a Gradio Interface where you select a PDF file from your desktop and then get the answers.
11
+ ## EXECUTIVE SUMMARY
12
+
13
+ This repository contains Python scripts that automate the process of generating reports from PDF files using OpenAI's
14
+ GPT-4 model. The scripts extract text from PDF files, send the text to the GPT-4 model for processing, and save the
15
+ generated reports as text files. The scripts also include functionality to render the generated reports as an HTML
16
+ document and Gradio GUI for easy viewing.
17
+ ## SETUP
18
+
19
+ 1. Clone the repository to your local machine.
20
+ 2. Install the required Python packages by running `pip install -r requirements.txt` in your terminal.
21
+ 3. Obtain an API key from OpenAI and save it in a file named `key_openai.txt` in the root directory of the repository.
22
+ 4. Place the PDF files you want to generate reports from in the `input/` directory.
23
+
24
+ ## USAGE
25
+
26
+ 1. Run the `generate_multiple_reports.py` script to generate reports from the PDF files in the `input/` directory. The
27
+ generated reports will be saved as text files in the `output/` directory.
28
+ 2. Run the `render_report.py` script to render the generated reports as an HTML document. The HTML document will be
29
+ saved as `report.html` in the root directory of the repository.
30
+ 3. You can modify the `prompts` in `generate_multiple_reports.py` to focus on any questions you would like to ask. In other words you can automatically ask any set of questions in bulk against any set of papers. This can help you greatly accelerate your literature reviews and surveys.
31
+
32
+ ## NOTE
33
+
34
+ The scripts are designed to handle errors and retries when communicating with the OpenAI API. If the API returns an
35
+ error due to the maximum context length being exceeded, the scripts will automatically trim the oldest message and retry
36
+ the API call. If the API returns any other type of error, the scripts will retry the API call after a delay, with the
37
+ delay increasing exponentially for each consecutive error. If the API returns errors for seven consecutive attempts, the
38
+ scripts will stop and exit.
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import asyncio
3
+ from core import chatbot, chat_print, estimate_costs
4
+ import openai
5
+ import PyPDF2
6
+ from file_operations import open_file
7
+
8
# Default analysis questions for a paper.
# NOTE(review): as written, this module-level list is never used by the UI —
# process_pdf() builds its own local `prompts` from the three Gradio
# textboxes, shadowing this one. Confirm whether these defaults should be
# pre-filled into the textboxes instead.
prompts = [
    "Can you give me a very clear explanation of the core assertions, implications, and mechanics mentioned in this paper?",
    "Can you give a short summary with bullet points and key takeaways",
    "Can you give me an analogy or metaphor that will help explain this to a broad audience.",
]
13
+
14
+
15
async def process_prompt(prompt, ALL_MESSAGES, model):
    """Ask one *prompt* against the conversation and return a Q/A report section.

    Side effects: appends the user prompt and the assistant reply to
    ALL_MESSAGES in place, and echoes the reply to the console. The token
    usage returned by the API is discarded.
    """
    ALL_MESSAGES.append({"role": "user", "content": prompt})
    reply, _usage = await chatbot(ALL_MESSAGES, model)
    chat_print(reply)
    ALL_MESSAGES.append({"role": "assistant", "content": reply})
    return "\n\n\n\nQ: %s\n\nA: %s" % (prompt, reply)
23
+
24
+
25
async def process_pdf_content(text, prompts):
    """Estimate the cost of querying *text* with *prompts* and, if affordable,
    answer every prompt against the paper.

    Returns the concatenated answers, or a refusal string when the estimated
    cost exceeds $2.
    """
    model = "gpt-3.5-turbo-16k"
    if len(text) > 22000:
        # Long papers are truncated to 22,000 characters and routed to the
        # larger-context model (22k chars is ~29k tokens at the estimate
        # below, beyond the 16k model's window).
        text = text[:22000]
        model = "gpt-4-32k"
    # Rough token estimate used throughout: ~0.75 characters per token
    # (a deliberate overestimate — TODO confirm against tiktoken).
    prompt_tokens = len(text) / 0.75
    for p in prompts:
        prompt_tokens += len(p) / 0.75
    costs = estimate_costs(prompt_tokens, model)
    print(costs)
    if costs > 2:
        # Fixed typo in the user-facing refusal message ("TO" -> "TOO").
        return f"THIS IS WAY TOO MUCH {costs}"
    else:
        # BUG FIX: previously all tasks shared ONE mutable message list while
        # running concurrently under asyncio.gather, so each API call saw a
        # nondeterministic, interleaved conversation. Give every prompt its
        # own message list (system message = the paper text) so each question
        # is answered independently and deterministically.
        prompt_tasks = [
            process_prompt(p, [{"role": "system", "content": text}], model)
            for p in prompts
        ]
        results = await asyncio.gather(*prompt_tasks)
        return " ".join(results).strip()
42
+
43
+
44
def process_pdf(pdf_file, prompt1, prompt2, prompt3):
    """Gradio handler: extract the text of *pdf_file* and answer the prompts.

    Empty prompt boxes are skipped. Returns the combined answers (or the
    cost-refusal message) as a single string.
    """
    # Only include prompts that are not empty.
    prompts = [prompt for prompt in [prompt1, prompt2, prompt3] if prompt]
    # Extract the text of every page of the uploaded PDF.
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    paper = "".join(page.extract_text() for page in pdf_reader.pages)
    # Gradio invokes this handler from a worker thread with no running event
    # loop, so create one for the async pipeline.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        result = loop.run_until_complete(process_pdf_content(paper, prompts))
    finally:
        # BUG FIX: the loop was never closed before, leaking one event loop
        # (and its selector file descriptor) per processed PDF.
        loop.close()
    return result
60
+
61
+
62
# Load the OpenAI API key from disk.
# NOTE(review): key_openai.txt must never be committed to version control —
# a real key in the repo is a credential leak.
openai.api_key = open_file("key_openai.txt").strip()

# Build the Gradio UI: one PDF upload plus three free-form prompt boxes.
# Uses the current Gradio 3.x component API (gr.File / gr.Textbox) instead of
# the deprecated gr.inputs.* namespace.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(),
        gr.Textbox(lines=2, placeholder="Enter Prompt 1 Here...", label="Prompt 1"),
        gr.Textbox(lines=2, placeholder="Enter Prompt 2 Here...", label="Prompt 2"),
        gr.Textbox(lines=2, placeholder="Enter Prompt 3 Here...", label="Prompt 3"),
    ],
    outputs="text",
    title="Paper Analyser",
    # Fixed grammar in the user-facing description ("analyse" -> "analyses").
    description="This tool analyses your academic papers and returns key findings",
)

iface.launch()
core.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import textwrap
3
+
4
+ import openai
5
+ from halo import Halo
6
+
7
# USD price tables per 1,000 tokens, keyed by model name.
# Completion (output) tokens:
gpt_costs_per_thousand_out = {
    "gpt-3.5-turbo-16k": 0.004,
    "gpt-4-32k": 0.12,
}
# Prompt (input) tokens:
gpt_costs_per_thousand_in = {
    "gpt-3.5-turbo-16k": 0.003,
    "gpt-4-32k": 0.06,
}


def estimate_costs(prompt_tokens, model: str):
    """Return the prompt-side cost in USD for *prompt_tokens* on *model*.

    Only input-token pricing is counted; completion cost is not estimated.
    Raises KeyError for a model missing from the price table.
    """
    rate = gpt_costs_per_thousand_in[model]
    return (prompt_tokens / 1000) * rate
20
+
21
+
22
async def chatbot(conversation, model, temperature=0):
    """Send *conversation* to the OpenAI chat API; return (reply_text, usage).

    On "maximum context length" errors the oldest message is dropped and the
    call retried immediately. Any other error is retried with exponential
    backoff (5s, 10s, 20s, ...); after seven consecutive failures the process
    exits.
    """
    max_retry = 7
    retry = 0
    while True:
        spinner = Halo(text="Thinking...", spinner="dots")
        spinner.start()
        try:
            response = await openai.ChatCompletion.acreate(
                model=model, messages=conversation, temperature=temperature
            )
            text = response["choices"][0]["message"]["content"]
            return text, response["usage"]
        except Exception as oops:
            print(f'\n\nError communicating with OpenAI: "{oops}"')
            if "maximum context length" in str(oops):
                # NOTE(review): this drops conversation[0], which in this app
                # is the system message holding the paper text — confirm that
                # is the intended trimming strategy.
                conversation.pop(0)
                print("\n\n DEBUG: Trimming oldest message")
                continue
            retry += 1
            if retry >= max_retry:
                # NOTE(review): exiting the whole process from a helper is
                # drastic; consider raising so callers can decide.
                print(f"\n\nExiting due to excessive errors in API: {oops}")
                exit(1)
            print(f"\n\nRetrying in {2 ** (retry - 1) * 5} seconds...")
            await asyncio.sleep(2 ** (retry - 1) * 5)
        finally:
            # BUG FIX: the spinner was previously only stopped on success,
            # leaving it animating forever once the API call raised.
            spinner.stop()
49
+
50
+
51
def chat_print(text):
    """Print a chatbot reply under a "CHATBOT:" banner, each input line
    wrapped to 120 columns with a leading indent."""
    wrapped = "\n".join(
        textwrap.fill(line, width=120, initial_indent=" ", subsequent_indent=" ")
        for line in text.split("\n")
    )
    print("\n\n\nCHATBOT:\n\n%s" % wrapped)
file_operations.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+
3
+
4
def save_file(filepath, content):
    """Write *content* to *filepath* as UTF-8 text, replacing any existing file."""
    with open(filepath, mode="w", encoding="utf-8") as handle:
        handle.write(content)
+
8
+
9
def open_file(filepath):
    """Return the full UTF-8 text of *filepath*, silently ignoring decode errors."""
    with open(filepath, mode="r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
12
+
13
+
14
def save_yaml(filepath, data):
    """Serialise *data* to *filepath* as YAML (UTF-8, non-ASCII kept as-is)."""
    with open(filepath, "w", encoding="utf-8") as handle:
        yaml.dump(data, handle, allow_unicode=True)
+
18
+
19
def open_yaml(filepath):
    """Parse *filepath* as YAML and return the resulting Python object.

    Uses yaml.safe_load, the loader PyYAML recommends: it constructs only
    standard YAML types, whereas the previous FullLoader can instantiate a
    wider range of Python objects and should not be used on files that may
    come from outside the project.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
key_openai.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ sk-REDACTED-PUT-YOUR-OWN-OPENAI-API-KEY-HERE
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ bs4
2
+ halo
3
+ openai
4
+ pypdf2
5
+ pyyaml
6
+ gradio
7
+ cffi