sweetssweets committed on
Commit
6053825
·
verified ·
1 Parent(s): 34a2c75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -65
app.py CHANGED
@@ -18,85 +18,85 @@ def extract_prediction(generated_text):
18
  extracted_content = generated_text # generated_text.split("\[/INST\]")
19
  return extracted_content
20
 
21
- def NMRExtractor(Paragraph, max_length):
22
- return 1,2,3,4,5,6,7
23
 
24
 
25
- # def NMRExtractor(Paragraph, max_length):
26
- # import torch
27
- # import os
28
- # import json
29
- # import numpy as np
30
- # # os.environ["CUDA_VISIBLE_DEVICES"] = "6"
31
- # # # device = "cuda:0" if torch.cuda.is_available() else "cpu"
32
- # device = "cpu"
33
- # from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
34
- # # !pip install vllm
35
- # # from vllm import LLM, SamplingParams
36
- # # sampling_params = SamplingParams(temperature=0, top_p=1,max_tokens = 4096, stop = ['!!!'])
37
 
38
- # merged_dir = f"sweetssweets/NMRExtractor"
39
 
40
- # # Reload model in FP16 and merge it with LoRA weights
41
- # base_model = AutoModelForCausalLM.from_pretrained(
42
- # merged_dir,
43
- # low_cpu_mem_usage = True,
44
- # return_dict = True,
45
- # torch_dtype = torch.bfloat16,
46
- # device_map = device,
47
- # )
48
 
49
- # # Reload tokenizer to save it
50
- # tokenizer = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True,truncation = True)
51
- # tokenizer.pad_token = tokenizer.eos_token
52
- # tokenizer.padding_side = "right"
53
 
54
- # #llm = LLM(model=merged_dir,device=device)
55
 
56
- # prom = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak, such as "13C NMR data": "131.4–128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "12.57 (s, 1H), 7.97–7.95 (d, J = 8.25 Hz, 2H)". Please keep the duplicate values of the original data and do not modify the number of decimal places. All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''
57
 
58
- # #prompt = f"{prom} {Paragraph}"
59
- # prompt = (f"<s>[INST] {prom} {Paragraph} [/INST]")
60
 
61
- # import math
62
- # import re
63
- # # Generate texts from the prompts. The output is a list of RequestOutput objects
64
- # # that contain the prompt, generated text, and other information.
65
 
66
- # pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=max_length)
67
- # print('prompt',prompt)
68
- # result = pipe(f"{prompt}",max_length=max_length,truncation=True)
69
- # print('result',result)
70
- # generation = result[0]['generated_text']
71
- # print('generation',generation)
72
- # generated_text = extract_prediction(generation)
73
- # #output = llm.generate(prompt, sampling_params)
74
- # print('generated_text',generated_text)
75
- # #generated_text = output[0].outputs[0].text.strip()
76
 
77
- # #predictions_prob = math.exp(output[0].outputs[0].cumulative_logprob)
78
 
79
- # try:
80
- # if generated_text is np.nan:
81
- # return {}
82
- # else:
83
- # result = json.loads(str(generated_text).replace("'", "\""))
84
- # print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
85
- # print(result.keys())
86
- # return result['IUPAC'],result['1H NMR text'],result['1H NMR conditions'],result['1H NMR data'],result['13C NMR text'],result['13C NMR conditions'],result['13C NMR data']
87
- # except (ValueError, TypeError, json.JSONDecodeError):
88
- # print('####################################')
89
- # pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'
90
- # matches = re.findall(pattern, generated_text)
91
- # result = {key: value for key, value in matches}
92
 
93
- # keys = ["IUPAC", "1H NMR text", "1H NMR conditions", "1H NMR data", "13C NMR text", "13C NMR conditions", "13C NMR data"]
94
 
95
- # for key in keys:
96
- # result[key] = result.get(key, 'N/A')
97
- # print(result)
98
- # return result['IUPAC'],result['1H NMR text'],result['1H NMR conditions'],result['1H NMR data'],result['13C NMR text'],result['13C NMR conditions'],result['13C NMR data']
99
- # #return 1,2,3,4,5,6,7
100
  demo = gr.Interface(
101
  fn=NMRExtractor,
102
  inputs=[gr.Textbox(label="Paragraph", lines=25,info='Paragraph'), gr.Slider(value=2048, minimum=2048, maximum=4096, step=1024),
 
18
  extracted_content = generated_text # generated_text.split("\[/INST\]")
19
  return extracted_content
20
 
21
+ # def NMRExtractor(Paragraph, max_length):
22
+ # return 1,2,3,4,5,6,7
23
 
24
 
25
+ def NMRExtractor(Paragraph, max_length):
26
+ import torch
27
+ import os
28
+ import json
29
+ import numpy as np
30
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "6"
31
+ # # device = "cuda:0" if torch.cuda.is_available() else "cpu"
32
+ device = "cpu"
33
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging
34
+ # !pip install vllm
35
+ # from vllm import LLM, SamplingParams
36
+ # sampling_params = SamplingParams(temperature=0, top_p=1,max_tokens = 4096, stop = ['!!!'])
37
 
38
+ merged_dir = f"sweetssweets/NMRExtractor"
39
 
40
+ # Reload model in FP16 and merge it with LoRA weights
41
+ base_model = AutoModelForCausalLM.from_pretrained(
42
+ merged_dir,
43
+ low_cpu_mem_usage = True,
44
+ return_dict = True,
45
+ torch_dtype = torch.bfloat16,
46
+ device_map = device,
47
+ )
48
 
49
+ # Reload tokenizer to save it
50
+ tokenizer = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True,truncation = True)
51
+ tokenizer.pad_token = tokenizer.eos_token
52
+ tokenizer.padding_side = "right"
53
 
54
+ #llm = LLM(model=merged_dir,device=device)
55
 
56
+ prom = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The content in the 13C NMR data removes information such as the position and shape of the peak, such as "13C NMR data": "131.4–128.0, 157.7". The content in the 1H NMR data should include information such as the position and shape of the peak, such as "1H NMR data": "12.57 (s, 1H), 7.97–7.95 (d, J = 8.25 Hz, 2H)". Please keep the duplicate values of the original data and do not modify the number of decimal places. All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''
57
 
58
+ #prompt = f"{prom} {Paragraph}"
59
+ prompt = (f"<s>[INST] {prom} {Paragraph} [/INST]")
60
 
61
+ import math
62
+ import re
63
+ # Generate texts from the prompts. The output is a list of RequestOutput objects
64
+ # that contain the prompt, generated text, and other information.
65
 
66
+ pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=max_length)
67
+ print('prompt',prompt)
68
+ result = pipe(f"{prompt}",max_length=512,truncation=True)
69
+ print('result',result)
70
+ generation = result[0]['generated_text']
71
+ print('generation',generation)
72
+ generated_text = extract_prediction(generation)
73
+ #output = llm.generate(prompt, sampling_params)
74
+ print('generated_text',generated_text)
75
+ #generated_text = output[0].outputs[0].text.strip()
76
 
77
+ #predictions_prob = math.exp(output[0].outputs[0].cumulative_logprob)
78
 
79
+ try:
80
+ if generated_text is np.nan:
81
+ return {}
82
+ else:
83
+ result = json.loads(str(generated_text).replace("'", "\""))
84
+ print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
85
+ print(result.keys())
86
+ return result['IUPAC'],result['1H NMR text'],result['1H NMR conditions'],result['1H NMR data'],result['13C NMR text'],result['13C NMR conditions'],result['13C NMR data']
87
+ except (ValueError, TypeError, json.JSONDecodeError):
88
+ print('####################################')
89
+ pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'
90
+ matches = re.findall(pattern, generated_text)
91
+ result = {key: value for key, value in matches}
92
 
93
+ keys = ["IUPAC", "1H NMR text", "1H NMR conditions", "1H NMR data", "13C NMR text", "13C NMR conditions", "13C NMR data"]
94
 
95
+ for key in keys:
96
+ result[key] = result.get(key, 'N/A')
97
+ print(result)
98
+ return result['IUPAC'],result['1H NMR text'],result['1H NMR conditions'],result['1H NMR data'],result['13C NMR text'],result['13C NMR conditions'],result['13C NMR data']
99
+ #return 1,2,3,4,5,6,7
100
  demo = gr.Interface(
101
  fn=NMRExtractor,
102
  inputs=[gr.Textbox(label="Paragraph", lines=25,info='Paragraph'), gr.Slider(value=2048, minimum=2048, maximum=4096, step=1024),