# NMRExtractor / app.py
# sweetssweets's picture
# Update app.py
# a8fb8ab verified
import gradio as gr
import time
import torch
import json
import numpy as np
import math
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
# Value handed to ``device_map`` when NMRExtractor loads the model.
device = "cpu"
def extract_prediction(generated_text):
    """Return the model's answer cut out of the raw generation output.

    The answer is the text between the first `` [/INST] `` marker and the
    next ``!!!`` terminator.

    Args:
        generated_text: Full text produced by the generation pipeline
            (the prompt followed by the completion).

    Returns:
        The captured answer, or ``generated_text`` unchanged when the
        markers are not present or the input is not a string.
    """
    try:
        # DOTALL lets the answer span several lines; without it a completion
        # that contains a newline never matches and the whole prompt would
        # be returned instead of the answer.
        match = re.search(r" \[/INST\] (.*?)!!!", generated_text, re.DOTALL)
    except TypeError:
        # re.search raises TypeError for non-string input (None, NaN, ...).
        print("Error! ", generated_text)
        return generated_text
    return match.group(1) if match else generated_text
# Output fields, in the order the Gradio interface lists its output boxes.
_NMR_FIELDS = (
    "IUPAC",
    "1H NMR text",
    "1H NMR conditions",
    "1H NMR data",
    "13C NMR text",
    "13C NMR conditions",
    "13C NMR data",
)

# Fallback matcher for `"key":"value"` pairs when the answer is not valid JSON.
_NMR_FIELD_PATTERN = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'

# Instruction text placed in front of the user's paragraph.
_NMR_INSTRUCTION = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak, such as "13C NMR data": "131.4–128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "12.57 (s, 1H), 7.97–7.95 (d, J = 8.25 Hz, 2H)". Please keep the duplicate values of the original data and do not modify the number of decimal places. All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''

# (model, tokenizer) pairs keyed by model path, so the weights are loaded
# once per process instead of on every request.
_NMR_MODEL_CACHE = {}


def _load_nmr_model(model_path):
    """Return the (model, tokenizer) pair for model_path, loading it on first use."""
    if model_path not in _NMR_MODEL_CACHE:
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            return_dict=True,
            torch_dtype=torch.bfloat16,
            device_map=device,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # Reuse the end-of-sequence token for padding, padding on the right.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        _NMR_MODEL_CACHE[model_path] = (model, tokenizer)
    return _NMR_MODEL_CACHE[model_path]


def _parse_nmr_json(generated_text):
    """Parse generated_text as a JSON object; return the dict, or None on failure."""
    # Try the text verbatim first: blindly turning every ' into " corrupts
    # otherwise valid JSON whose values contain apostrophes (e.g. "H-1'").
    # The quote-swapped variant is kept as a second attempt for answers
    # written with single-quoted keys/values.
    for candidate in (generated_text, generated_text.replace("'", '"')):
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def NMRExtractor(Paragraph, max_length):
    """Extract the compound name and NMR fields from a paragraph with the LLM.

    Args:
        Paragraph: Text to analyse; it is appended to the fixed instruction
            prompt.
        max_length: Forwarded to the text-generation pipeline as
            ``max_length``. A value of 1024 triggers a Gradio warning.

    Returns:
        A 7-tuple ordered as IUPAC, 1H NMR text, 1H NMR conditions,
        1H NMR data, 13C NMR text, 13C NMR conditions, 13C NMR data.
        Fields that cannot be recovered are ``'N/A'``; when only the prompt's
        own template is found, every element is a "please increase
        max_length" message.
    """
    model, tokenizer = _load_nmr_model("sweetssweets/NMRExtractor")
    prompt = f"<s>[INST] {_NMR_INSTRUCTION} {Paragraph} [/INST]"
    if max_length == 1024:
        gr.Warning("The text length exceeds max_length, and the large model output will be truncated. Please refresh the page to increase max_length and submit again!!!")
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
    print('prompt', prompt)
    result = pipe(prompt, max_length=max_length, truncation=True)
    print('result', result)
    generation = result[0]['generated_text']
    print('generation', generation)
    generated_text = extract_prediction(generation)
    print('generated_text', generated_text)

    # Always hand back seven values; anything else breaks the seven output
    # components of the interface.
    if not isinstance(generated_text, str):
        return tuple('N/A' for _ in _NMR_FIELDS)

    parsed = _parse_nmr_json(generated_text)
    if parsed is not None:
        print(parsed.keys())
        # .get avoids an unhandled KeyError when the model omits a field.
        return tuple(parsed.get(key, 'N/A') for key in _NMR_FIELDS)

    # Not valid JSON: salvage individual "key":"value" pairs.
    fields = dict(re.findall(_NMR_FIELD_PATTERN, generated_text))
    for key in _NMR_FIELDS:
        fields.setdefault(key, 'N/A')
    print(fields)
    # The literal value 'text' comes from the format template at the end of
    # the prompt, i.e. the prompt was matched rather than a real answer.
    if fields['IUPAC'] == fields['13C NMR data'] == fields['1H NMR data'] == 'text':
        message = "After the test is run, the output length exceeds max_length, please increase max_length!!!"
        gr.Warning("The text length exceeds max_length, and the large model output is truncated. Please increase max_length!!!")
        return tuple(message for _ in _NMR_FIELDS)
    return tuple(fields[key] for key in _NMR_FIELDS)
# Gradio UI: one paragraph box and a max_length slider in, seven text boxes out.
# The text below was repaired from mis-decoded UTF-8 (e.g. "Ξ΄" -> "δ",
# "Β°" -> "°", "βˆ’" -> "−").
# NOTE(review): the decorative emoji in front of "This demo ...", "2." and
# "3." were reconstructed from incomplete byte sequences — confirm.
demo = gr.Interface(
    fn=NMRExtractor,
    inputs=[
        gr.Textbox(label="Paragraph", lines=25, info='Paragraph'),
        gr.Slider(value=1024, minimum=1024, maximum=4096, step=1024),
    ],
    outputs=[
        gr.Textbox(label="IUPAC", lines=1, info='IUPAC name'),
        gr.Textbox(label="1H NMR text", lines=1),
        gr.Textbox(label="1H NMR conditions", lines=1),
        gr.Textbox(label="1H NMR data", lines=1),
        gr.Textbox(label="13C NMR text", lines=1),
        gr.Textbox(label="13C NMR conditions", lines=1),
        gr.Textbox(label="13C NMR data", lines=1),
    ],
    title='NMRExtractor (CPU version)',
    description="""
🔔**Guide**\n
🏆 This demo is the CPU version of NMRextractor. Since GPU acceleration is not used, the CPU inference speed that huggingface can use for free is very slow. We tested that when max_length is 2048, it takes 1 hour to infer one item.⏰ \n
💡 We have released the CPU and GPU versions of NMRExtractor on github. The CPU inference speed of our local computer is 1 item per 5 minutes, and the single-card GPU inference speed is 2 items per 0.5 seconds. https://github.com/eat-sugar/NMRExtractor.\n
⏰⏰⏰**Because it takes too long to run this demo on huggingface's free server!!! We strongly recommend that you use the tutorials and codes provided by https://github.com/eat-sugar/NMRExtractor to deploy your NMRExtractor locally.**\n
✏️1. Enter your text or click to select one from the examples below.\n
📀2. Setting parameters: "max_length": Modify according to the length of the text you enter. Since our fine-tuning prompt takes up more input tokens, your max_length needs to be set to at least 2048.\n
🏆3. After clicking submit, because the free CPU runs very slowly, you need to wait for a long time⏰.
""",
    article='NMRExtractor Code and Tutorial: https://github.com/eat-sugar/NMRExtractor',
    examples=[
        ["Description of 5-(4-(3-chloro-5-(trifluoromethyl)pyridin-2-yl)phenoxy)-N-(2,4-difluorophenyl)-2-nitrobenzamide (5i): White solid, yield 78.1%. m.p. 159.1–161.5 °C; 1H NMR (400 MHz, CDCl3) δ 8.84 (s, 1H), 8.25 (s, 1H), 8.09 (dd, J = 5.1, 3.7 Hz, 2H), 7.86 (d, J = 8.6 Hz, 2H), 7.52 (s, 1H), 7.42 (d, J = 8.1 Hz, 1H), 7.33 (t, J = 8.2 Hz, 1H), 7.22 (d, J = 8.5 Hz, 2H), 7.15–7.10 (m, 2H), 7.02 (d, J = 8.1 Hz, 1H); 13C NMR (101 MHz, CDCl3) δ 163.78, 162.18, 158.52, 155.36, 144.36, 140.24, 135.60, 134.94, 134.50, 131.93, 130.20, 127.62, 126.25, 123.49, 123.38, 120.21, 118.70, 116.89, 111.70, 111.48, 103.80. HRMS calcd. for C25H13ClF5N3O4 [M–H]− 548.0422, found 548.0422.", 1024],
        # Further sample paragraphs, currently disabled:
        #[ "Compound 11 (Supplementary Figures S24 and S25): Light yellow oil. ESI-MS m/z: 259 [M + H]+. 1H-NMR (700 MHz, CD3OD) δ: 7.86 (1H, d, J = 1.2 Hz, H-6), 5.91 (1H, d, J = 4.7 Hz, H-1′), 4.20 − 4.16 (2H, m, H-2′, 3′), 4.00 (1H, dt, J = 4.2, 2.9 Hz, H-4′), 3.86 (1H, dd, J = 12.2, 2.7 Hz, H-5’a), 3.75 (1H, dd, J = 12.2, 3.0 Hz, H-5’b), 1.89 (3H, d, J = 1.2 Hz, 5-Me); 13C-NMR (175 MHz, CD3OD) δ: 166.4 (C-4), 152.7 (C-2), 138.4 (C-6), 111.5 (C-5), 90.4 (C-1′), 86.3 (C-4′), 75.5 (C-2′), 71.3 (C-3′), 62.3 (C-5′), and 12.4 (5-Me). The above data are in general agreement with the NMR data reported in the literature, so the compound was identified as 1-(β-D-ribofuranosyl)thymine [38].", 1024],
        #[ "3.2.5. (1-([1,1′-Biphenyl]-4-yl)Ethyl)Dimethylsilane, 4f; Off-White Solid 1H NMR (300 MHz, CDCl3), δ: 7.64–7.58 (m, 2H), 7.50–7.55 (m, 2H), 7.40–7.48 (m, 2H), 7.29–7.37 (m, 1H), 7.14–7.21 (m, 2H), 3.90 (pd, J = 3.65, 2.70 Hz, 1H), 2.35 (qd, J = 7.48, 2.78 Hz, 1H), 1.46 (d, J = 7.50 Hz, 3H), 0.07 (dd, J = 11.01, 3.61 Hz, 6H); 13C NMR (75 MHz, CDCl3), δ: 144.5, 141.0, 137.2, 128.6, 127.1, 126.9, 126.7, 126.8, 27.4, 15.2, −6.0 Conforms to the literature [32].", 2048],
        #[ "(4aR*,10S*,10aR*)-6-Methoxy-2-methyl-1,2,3,4,10,10a-hexahydro-4aH-chromeno[3,2-c]pyridine-4a,10-diol (7d). White crystals, yield 74%, m.p. = 134–135 °C. 1H NMR (600 MHz, DMSO-d6) δ (ppm): 1.78–1.87 (m, 2H), 1.90–1.93 (m, 1H), 1.93–1.96 (m, 1H), 2.12–2.16 (m, 1H), 2.22 (s, 3H), 2.67–2.73 (m, 1H), 2.97–3.02 (m, 1H), 3.71 (s, 3H), 4.31 (d, J = 10.8 Hz, 1H), 5.17 (s, 1H), 6.32 (s, 1H), 6.80–6.82 (m, 2H), 7.00–7.02 (m, 1H). 13C NMR (DMSO-d6, 151 MHz) δ (ppm): 36.8, 45.4, 45.7, 52.2, 55.3, 55.3, 63.4, 96.7, 110.4, 118.9, 119.4, 128.4, 140.9, 147.6. HRMS (MALDI+) m/z calcd for C14H19NO4 in form of [M + H]+ ion 266.1392, found: 266.1405.", 2048],
        #[ "3.2.1. 5a. 2-[3-Amino-5-methyl-5-(pyridin-3-yl)-1,5-dihydro-4H-1,2,4-triazol-4-yl]propanoic Acid IR (KBr, cm−1) 3348, 3258, 2854, 2560, 1670, 1546, 1416, 1343, 1187, 1064, 747. 1H NMR (400 MHz, DMSO-d6): δ 1.25 (s, 3H, CH3), 1.53 (d, 3H, CH3), 3.88 (q, 1H, CH), 4.33 (s, 2H, NH2), 7.00 (s, 1H, NH), 7.58–8.60 (m, 4H, Ar–H), 11.10 (s, 1H, OH) ppm; 13C NMR (100 MHz, DMSO-d6): δ 26.16, 27.30, 42.68, 72.35, 99.49, 121.21, 123.74, 136.02, 146.08, 158.55, 174.67 ppm. Anal. Calcd. for C11H15N5O2: C, 53.00, H, 6.07, N, 28.10. Found: C, 52.82, H, 6.09, N, 28.07.", 2048]
    ],
)
# Start the Gradio server for the interface; share=True also asks Gradio for a public share link.
demo.launch(share=True)