sweetssweets committed on
Commit
ee8184c
·
verified ·
1 Parent(s): 65b1286

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+
4
def extract_prediction(generated_text):
    """Return the model's answer cut out of the raw generated text.

    The answer is whatever sits between the literal marker "[/INST] " and
    the first "!!!" that follows it on the same line (the pattern does not
    span newlines).

    Args:
        generated_text: Full text produced by the generation pipeline,
            i.e. the prompt followed by the completion.

    Returns:
        The captured answer when the markers are found; otherwise
        ``generated_text`` unchanged (also when it is not a string).
    """
    # Imported here because the module itself never imports ``re``; without
    # this the search below raised NameError, which the former bare
    # ``except`` swallowed, so the function always returned its input.
    import re

    # Grab the content between "[/INST] " and the first "!!!".
    pattern = r"\[/INST\] (.*?)!!!"
    try:
        match = re.search(pattern, generated_text)
    except TypeError:
        # Non-string input: keep the best-effort contract and hand it back.
        print("Eror! ", generated_text)
        return generated_text

    print('匹配中')  # log: "matching"
    if match:
        print('匹配成功')  # log: "match succeeded"
        return match.group(1)

    # log: "probably too long" - no "!!!" terminator was found in the text.
    print("应该是太长了!!!")
    return generated_text
20
+
21
# Output fields, in the order the Gradio interface displays them.
_NMR_FIELDS = (
    "IUPAC",
    "1H NMR text",
    "1H NMR conditions",
    "1H NMR data",
    "13C NMR text",
    "13C NMR conditions",
    "13C NMR data",
)


def _parse_nmr_fields(generated_text):
    """Convert the model's reply into one value per entry of ``_NMR_FIELDS``.

    Parsing strategy, in order:
      1. the reply as strict JSON;
      2. the reply with single quotes swapped for double quotes (the
         original behaviour, for Python-dict-looking replies);
      3. a regex scan for ``"<field>":"<value>"`` pairs.
    Any field that cannot be recovered is reported as ``'N/A'``.
    """
    import json
    import re

    if not isinstance(generated_text, str):
        # Previously an empty dict was returned for NaN, which does not fit
        # the seven outputs the interface expects.
        return tuple('N/A' for _ in _NMR_FIELDS)

    parsed = None
    # Try the text verbatim first so apostrophes inside valid JSON survive.
    for candidate in (generated_text, generated_text.replace("'", "\"")):
        try:
            loaded = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if isinstance(loaded, dict):
            parsed = loaded
            break

    if parsed is not None:
        print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
        print(parsed.keys())
    else:
        print('####################################')
        pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'
        # Later occurrences of a key overwrite earlier ones.
        parsed = dict(re.findall(pattern, generated_text))
        for key in _NMR_FIELDS:
            parsed[key] = parsed.get(key, 'N/A')
        print(parsed)

    # .get() keeps a well-formed reply that lacks a key from raising KeyError.
    return tuple(parsed.get(key, 'N/A') for key in _NMR_FIELDS)


def NMRExtractor(Paragraph, max_length):
    """Extract compound name and 1H/13C NMR fields from a paragraph on CPU.

    Loads the fine-tuned causal LM and its tokenizer from ``merged_dir``,
    wraps ``Paragraph`` in the instruction prompt, generates a reply with a
    text-generation pipeline and parses that reply.

    Args:
        Paragraph: Source text to analyse.
        max_length: ``max_length`` handed to the generation pipeline.

    Returns:
        A 7-tuple: IUPAC, 1H NMR text, 1H NMR conditions, 1H NMR data,
        13C NMR text, 13C NMR conditions, 13C NMR data. Fields that could
        not be recovered are ``'N/A'``.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

    device = "cpu"

    # NOTE(review): absolute path on the author's machine - presumably needs
    # to point at a hub repo or bundled folder when deployed; confirm.
    merged_dir = "/home/zhangwei/wqg_NMR_data/extract_NMR_Paragraph_add_line_strip/code/saved_models/Mistral-7B-Instruct-v0.2/split_train_800_without_prompt_lr5e-06_bs1/checkpoint-1600"

    # NOTE(review): model and tokenizer are loaded on every call.
    base_model = AutoModelForCausalLM.from_pretrained(
        merged_dir,
        return_dict=True,
        torch_dtype=torch.float32,
        device_map=device,
    )

    tokenizer = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    prom = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak, such as "13C NMR data": "131.4–128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "12.57 (s, 1H), 7.97–7.95 (d, J = 8.25 Hz, 2H)". Please keep the duplicate values of the original data and do not modify the number of decimal places. All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''

    prompt = f"<s>[INST] {prom} {Paragraph} [/INST]"

    pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=max_length)
    print('prompt', prompt)
    result = pipe(prompt)
    print('result', result)
    generation = result[0]['generated_text']
    print('generation', generation)
    # The pipeline output still contains the prompt; keep only the answer.
    generated_text = extract_prediction(generation)
    print('generated_text', generated_text)

    return _parse_nmr_fields(generated_text)
97
# Gradio UI: one paragraph box plus a max_length slider in, seven text
# fields out (same order as the tuple returned by NMRExtractor).
demo = gr.Interface(
    fn=NMRExtractor,
    inputs=[
        gr.Textbox(label="Paragraph", lines=25, info='Paragraph'),
        gr.Slider(value=2048, minimum=2048, maximum=4096, step=1024),
    ],
    outputs=[
        gr.Textbox(label="IUPAC", lines=1, info='IUPAC name'),
        gr.Textbox(label="1H NMR text", lines=1),
        gr.Textbox(label="1H NMR conditions", lines=1),
        gr.Textbox(label="1H NMR data", lines=1),
        gr.Textbox(label="13C NMR text", lines=1),
        gr.Textbox(label="13C NMR conditions", lines=1),
        gr.Textbox(label="13C NMR data", lines=1),
    ],
    title='NMRExtractor (CPU version)',
    description="""
🔔**Guide**\n
🏆This demo is the CPU version of NMRExtractor. Since GPU and inference acceleration package are not used, the inference speed is about 5 minutes per item. You can choose a suitable max_length according to the length of the input text to improve the inference speed. \n
💡 We have released the GPU version code of NMRExtractor on github, and the inference speed is an average of 0.5 seconds per item. https://github.com/.../NMRExtractor\n
✏️1. Enter your text or click to select one from the examples below.\n
📀2. Set parameters: "max_length": modify according to the length of the text you enter.\n
🏆3. After clicking submit, wait about 300 seconds⏰.
""",
    article='https://github.com/.../NMRExtractor',
    # Each row is [Paragraph, max_length]. Every row needs its trailing
    # comma: two adjacent list literals are parsed as a subscript and raise
    # TypeError when the module is loaded.
    examples=[
        ["Description of 5-(4-(3-chloro-5-(trifluoromethyl)pyridin-2-yl)phenoxy)-N-(2,4-difluorophenyl)-2-nitrobenzamide (5i): White solid, yield 78.1%. m.p. 159.1–161.5 °C; 1H NMR (400 MHz, CDCl3) δ 8.84 (s, 1H), 8.25 (s, 1H), 8.09 (dd, J = 5.1, 3.7 Hz, 2H), 7.86 (d, J = 8.6 Hz, 2H), 7.52 (s, 1H), 7.42 (d, J = 8.1 Hz, 1H), 7.33 (t, J = 8.2 Hz, 1H), 7.22 (d, J = 8.5 Hz, 2H), 7.15–7.10 (m, 2H), 7.02 (d, J = 8.1 Hz, 1H); 13C NMR (101 MHz, CDCl3) δ 163.78, 162.18, 158.52, 155.36, 144.36, 140.24, 135.60, 134.94, 134.50, 131.93, 130.20, 127.62, 126.25, 123.49, 123.38, 120.21, 118.70, 116.89, 111.70, 111.48, 103.80. HRMS calcd. for C25H13ClF5N3O4 [M–H]− 548.0422, found 548.0422.", 2048],
        ["Compound 11 (Supplementary Figures S24 and S25): Light yellow oil. ESI-MS m/z: 259 [M + H]+. 1H-NMR (700 MHz, CD3OD) δ: 7.86 (1H, d, J = 1.2 Hz, H-6), 5.91 (1H, d, J = 4.7 Hz, H-1′), 4.20 − 4.16 (2H, m, H-2′, 3′), 4.00 (1H, dt, J = 4.2, 2.9 Hz, H-4′), 3.86 (1H, dd, J = 12.2, 2.7 Hz, H-5’a), 3.75 (1H, dd, J = 12.2, 3.0 Hz, H-5’b), 1.89 (3H, d, J = 1.2 Hz, 5-Me); 13C-NMR (175 MHz, CD3OD) δ: 166.4 (C-4), 152.7 (C-2), 138.4 (C-6), 111.5 (C-5), 90.4 (C-1′), 86.3 (C-4′), 75.5 (C-2′), 71.3 (C-3′), 62.3 (C-5′), and 12.4 (5-Me). The above data are in general agreement with the NMR data reported in the literature, so the compound was identified as 1-(β-D-ribofuranosyl)thymine [38].", 2048],
        ["3.2.5. (1-([1,1′-Biphenyl]-4-yl)Ethyl)Dimethylsilane, 4f; Off-White Solid 1H NMR (300 MHz, CDCl3), δ: 7.64–7.58 (m, 2H), 7.50–7.55 (m, 2H), 7.40–7.48 (m, 2H), 7.29–7.37 (m, 1H), 7.14–7.21 (m, 2H), 3.90 (pd, J = 3.65, 2.70 Hz, 1H), 2.35 (qd, J = 7.48, 2.78 Hz, 1H), 1.46 (d, J = 7.50 Hz, 3H), 0.07 (dd, J = 11.01, 3.61 Hz, 6H); 13C NMR (75 MHz, CDCl3), δ: 144.5, 141.0, 137.2, 128.6, 127.1, 126.9, 126.7, 126.8, 27.4, 15.2, −6.0 Conforms to the literature [32].", 2048],
        ["(4aR*,10S*,10aR*)-6-Methoxy-2-methyl-1,2,3,4,10,10a-hexahydro-4aH-chromeno[3,2-c]pyridine-4a,10-diol (7d). White crystals, yield 74%, m.p. = 134–135 °C. 1H NMR (600 MHz, DMSO-d6) δ (ppm): 1.78–1.87 (m, 2H), 1.90–1.93 (m, 1H), 1.93–1.96 (m, 1H), 2.12–2.16 (m, 1H), 2.22 (s, 3H), 2.67–2.73 (m, 1H), 2.97–3.02 (m, 1H), 3.71 (s, 3H), 4.31 (d, J = 10.8 Hz, 1H), 5.17 (s, 1H), 6.32 (s, 1H), 6.80–6.82 (m, 2H), 7.00–7.02 (m, 1H). 13C NMR (DMSO-d6, 151 MHz) δ (ppm): 36.8, 45.4, 45.7, 52.2, 55.3, 55.3, 63.4, 96.7, 110.4, 118.9, 119.4, 128.4, 140.9, 147.6. HRMS (MALDI+) m/z calcd for C14H19NO4 in form of [M + H]+ ion 266.1392, found: 266.1405.", 2048],
        ["3.2.1. 5a. 2-[3-Amino-5-methyl-5-(pyridin-3-yl)-1,5-dihydro-4H-1,2,4-triazol-4-yl]propanoic Acid IR (KBr, cm−1) 3348, 3258, 2854, 2560, 1670, 1546, 1416, 1343, 1187, 1064, 747. 1H NMR (400 MHz, DMSO-d6): δ 1.25 (s, 3H, CH3), 1.53 (d, 3H, CH3), 3.88 (q, 1H, CH), 4.33 (s, 2H, NH2), 7.00 (s, 1H, NH), 7.58–8.60 (m, 4H, Ar–H), 11.10 (s, 1H, OH) ppm; 13C NMR (100 MHz, DMSO-d6): δ 26.16, 27.30, 42.68, 72.35, 99.49, 121.21, 123.74, 136.02, 146.08, 158.55, 174.67 ppm. Anal. Calcd. for C11H15N5O2: C, 53.00, H, 6.07, N, 28.10. Found: C, 52.82, H, 6.09, N, 28.07.", 2048],
    ],
)

# Start the web server; share=True additionally requests a public link.
demo.launch(share=True)