Update app.py
Browse files
app.py
CHANGED
|
@@ -4,7 +4,8 @@ import fitz # PyMuPDF
|
|
| 4 |
from PIL import Image
|
| 5 |
from pathlib import Path
|
| 6 |
import os
|
| 7 |
-
|
|
|
|
| 8 |
|
| 9 |
api_key = os.getenv('API_KEY')
|
| 10 |
base_url = os.getenv("BASE_URL")
|
|
@@ -15,6 +16,60 @@ client = OpenAI(
|
|
| 15 |
)
|
| 16 |
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
def extract_pdf_pypdf(pdf_dir):
|
| 19 |
try:
|
| 20 |
doc = fitz.open(pdf_dir)
|
|
@@ -52,7 +107,54 @@ def openai_api(messages):
|
|
| 52 |
return None
|
| 53 |
|
| 54 |
|
| 55 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
if pdf_file is None:
|
| 57 |
return "Please upload a PDF file to proceed."
|
| 58 |
|
|
@@ -66,9 +168,17 @@ def predict(input_text, pdf_file):
|
|
| 66 |
'''
|
| 67 |
{{""" + file_content + """}}
|
| 68 |
'''
|
| 69 |
-
""" +
|
| 70 |
]
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
return extract_result or "Too many users. Please wait a moment!"
|
| 74 |
|
|
@@ -147,8 +257,10 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
| 147 |
with gr.Row():
|
| 148 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
| 149 |
extract_button = gr.Button("Extract Text", variant="primary")
|
|
|
|
| 150 |
with gr.Row():
|
| 151 |
with gr.Column(scale=1):
|
|
|
|
| 152 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
| 153 |
|
| 154 |
with gr.Column(scale=1):
|
|
@@ -167,7 +279,7 @@ with gr.Blocks(title="Automated Enzyme Kinetics Extractor") as demo:
|
|
| 167 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
| 168 |
exp = gr.Button("Example Prompt")
|
| 169 |
with gr.Row():
|
| 170 |
-
gen = gr.Button("Generate")
|
| 171 |
clr = gr.Button("Clear")
|
| 172 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
| 173 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|
|
|
|
| 4 |
from PIL import Image
|
| 5 |
from pathlib import Path
|
| 6 |
import os
|
| 7 |
+
import re
|
| 8 |
+
import tiktoken
|
| 9 |
|
| 10 |
api_key = os.getenv('API_KEY')
|
| 11 |
base_url = os.getenv("BASE_URL")
|
|
|
|
| 16 |
)
|
| 17 |
|
| 18 |
|
| 19 |
+
def cal_tokens(message_data):
    """Return the tiktoken token count of *message_data* (stringified).

    Counts with the gpt-3.5-turbo-0301 encoding; if the installed tiktoken
    does not know that model name, falls back to the cl100k_base encoding.
    """
    print("use tiktoken")
    try:
        enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(str(message_data)))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def del_references(lines):
    """Cut the references/bibliography section out of extracted article text.

    A sequence of progressively looser regex layouts is tried in order —
    mathpix-style ``\\section*{...}`` markers first, then plain markdown
    headings — and the first one that matches wins: the matched span is
    removed, re-inserting whatever trailing anchor (Tables / SUPPLEMENTARY /
    footnote marker) the pattern consumed.  If nothing matches, the text is
    returned unchanged.
    """
    heads = r'(References|Reference|REFERENCES|LITERATURE CITED|Referencesand notes|Notes and references)'

    # (pattern, replacement, log message) triples, most specific first.
    attempts = [
        # 1.1 mathpix md layout: \section*{REFERENCES} ... \section*{Tables
        (r'\*\{.{0,5}' + heads + r'(.*?)\\section\*\{Tables',
         "\section*{Tables\n",
         "1.1.匹配到了References和Tables,删除了References,保留了后面的Tables"),
        # 1.2 mathpix layout, references run to end of text
        (r'\*\{.{0,5}' + heads + r'(.*)',
         "",
         "1.2.匹配到了References,删除了References"),
        # 2.1 plain md heading, followed later by a Table(s) section
        (r'#.{0,15}' + heads + r'(.*?)(Table|Tables)',
         "Tables",
         "2.1.匹配到了## References和Tables,删除了References,保留了后面的Tables"),
        # 2.2 plain md heading, followed by a supplementary section
        (r'#.{0,15}' + heads + r'(.*?)# SUPPLEMENTARY',
         "# SUPPLEMENTARY",
         "2.2.匹配到了## References和# SUPPLEMENTARY,删除了References,保留了后面的# SUPPLEMENTARY"),
        # 2.3 plain md heading, followed by a [^0] footnote marker
        (r'#.{0,15}' + heads + r'(.*)\[\^0\]',
         "[^0]",
         "2.3.匹配到了## References和\[\^0\],删除了References和\[\^0\]之间的内容"),
        # 2.4 plain md heading, references run to end of text
        (r'#.{0,15}' + heads + r'(.*)',
         "",
         "2.4.匹配到了## References,删除了References"),
    ]

    for pattern, replacement, note in attempts:
        found = re.search(pattern, lines, re.DOTALL)
        if found:
            print(note)
            return lines.replace(found[0], replacement)

    print("没有匹配到References")
    return lines
|
| 71 |
+
|
| 72 |
+
|
| 73 |
def extract_pdf_pypdf(pdf_dir):
|
| 74 |
try:
|
| 75 |
doc = fitz.open(pdf_dir)
|
|
|
|
| 107 |
return None
|
| 108 |
|
| 109 |
|
| 110 |
+
def openai_chat_2_step(prompt, file_content):
    """Two-step extraction for articles too long for one context window.

    Step 1: split *file_content* into ~123k-character chunks and run the
    extraction *prompt* on each chunk independently, concatenating the
    per-chunk responses.
    Step 2: send the concatenated responses back to the model and ask it to
    merge the per-chunk tables into a single pipe-formatted table.

    Returns the step-2 response from openai_api (may be None on API failure).
    """
    all_response = ""
    # NOTE(review): len//123000 + 1 produces one extra *empty* chunk when the
    # length is an exact multiple of 123000 (and one empty chunk for empty
    # input) — that empty chunk is still sent to the API; confirm intended.
    for i in range(len(file_content)//123000 + 1):
        text = file_content[i*123000:(i+1)*123000]
        # step 1: extract from this chunk only
        messages = [
            {
                "role": "system",
                "content": "You are an expert in information extraction from scientific literature.",
            },
            {"role": "user",
             "content": "The following is a scientific article, please read it carefully: \n{" + text + "}\n" + prompt},
        ]
        tokens = cal_tokens(messages)
        print("step一: 抽取部分{}:".format(i))
        print("prompt tokens:", tokens)
        response_2_content = openai_api(messages)
        # Failed/empty chunk responses are silently skipped rather than retried.
        if response_2_content:
            all_response += response_2_content + "\n"

    # step 2: merge all per-chunk tables into one table
    messages = [
        {
            "role": "system",
            "content": "You are an expert in information extraction from scientific literature.",
        },
        {"role": "user", "content": """Provided Text:
'''
{{""" + all_response + """}}
'''
""" + """
Combine the above tables into one table.
Please pay attention to the pipe format as shown in the example below. This format is for reference only regarding the structure; the content within is not the focus of this instruction.

| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
| Enzyme1 | Bacillus subtilis | Substrate_A | 7.3 | mM | 6.4 | s^-1 | 1.4 × 10^4 | M^-1s^-1 | 37°C | 5.0 | WT | NADP^+ |
| Enzyme2 | Escherichia coli | Substrate_B | 5.9 | mM | 9.8 | s^-1 | 29000 | mM^-1min^-1 | 60°C | 10.0 | Q176E | NADPH |
| Enzyme3 | Homo sapiens | Substrate_C | 6.9 | mM | 15.6 | s^-1 | 43000 | µM^-1s^-1 | 65°C | 8.0 | T253S | NAD^+ |
"""}
    ]
    tokens = cal_tokens(messages)
    print("step二: 合并部分:")
    print("prompt tokens:", tokens)
    response = openai_api(messages)
    return response
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def predict(prompt, pdf_file):
|
| 158 |
if pdf_file is None:
|
| 159 |
return "Please upload a PDF file to proceed."
|
| 160 |
|
|
|
|
| 168 |
'''
|
| 169 |
{{""" + file_content + """}}
|
| 170 |
'''
|
| 171 |
+
""" + prompt}
|
| 172 |
]
|
| 173 |
+
tokens = cal_tokens(messages)
|
| 174 |
+
print("开始:抽取")
|
| 175 |
+
print("prompt tokens:", tokens)
|
| 176 |
+
# time.sleep(20) # claude 需要加这个
|
| 177 |
+
if tokens > 128000:
|
| 178 |
+
file_content = del_references(file_content)
|
| 179 |
+
extract_result = openai_chat_2_step(prompt, file_content)
|
| 180 |
+
else:
|
| 181 |
+
extract_result = openai_api(messages)
|
| 182 |
|
| 183 |
return extract_result or "Too many users. Please wait a moment!"
|
| 184 |
|
|
|
|
| 257 |
with gr.Row():
|
| 258 |
viewer_button = gr.Button("View PDF", variant="secondary")
|
| 259 |
extract_button = gr.Button("Extract Text", variant="primary")
|
| 260 |
+
|
| 261 |
with gr.Row():
|
| 262 |
with gr.Column(scale=1):
|
| 263 |
+
|
| 264 |
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
| 265 |
|
| 266 |
with gr.Column(scale=1):
|
|
|
|
| 279 |
model_input = gr.Textbox(lines=7, value=en_1, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
| 280 |
exp = gr.Button("Example Prompt")
|
| 281 |
with gr.Row():
|
| 282 |
+
gen = gr.Button("Generate", variant="primary")
|
| 283 |
clr = gr.Button("Clear")
|
| 284 |
outputs = gr.Markdown(label='Output', value="""| Enzyme | Organism | Substrate | Km | Unit_Km | Kcat | Unit_Kcat | Kcat/Km | Unit_Kcat/Km | Commentary[Temp] | Commentary[pH] | Commentary[Mutant] | Commentary[Cosubstrate] |
|
| 285 |
|------------|-------------------|-------------|-----|---------|------|-----------|---------|--------------|------------------|----------------|--------------------|-------------------------|
|