panotedi committed on
Commit
d042411
·
unverified ·
1 Parent(s): 449d64f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -7
app.py CHANGED
@@ -1,14 +1,118 @@
1
  import streamlit as st
 
2
  from transformers import pipeline
 
 
 
 
 
3
  st.title("CS634 - milestone2 - Tedi Pano")
4
 
5
# (removed by this commit) Sentiment-analysis pipeline created without an explicit model argument.
- sentiment_model = pipeline("sentiment-analysis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
# (removed by this commit) Form holding a single text box, pre-filled with a default sentence,
# and a "Submit" button whose return value is stored in `submitted`.
- with st.form("my_form"):
8
- text_input = st.text_input("Enter in a sentence for sentiment analysis" , "I love you so much it hurts sometimes.")
9
- submitted = st.form_submit_button("Submit")
10
 
 
 
11
 
12
# (removed by this commit) When `submitted` is truthy, run the model on the entered text and
# write out the first result's 'label' and 'score'.
# NOTE(review): indentation is lost in this view, so whether this `if` was nested inside
# the `with st.form(...)` block cannot be determined here.
- if submitted:
13
- output = sentiment_model(text_input)
14
- st.write("The sentiment analysis for '" + text_input+ "' is " + output[0]['label'] + " with a certainty score of " + str(output[0]['score']))
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
  from transformers import pipeline
4
+
5
+ from pprint import pprint
6
+ from datasets import load_dataset
7
+ from torch.utils.data import DataLoader
8
+
9
  # Page title rendered by Streamlit (unchanged context line in this diff).
  st.title("CS634 - milestone2 - Tedi Pano")
10
 
11
@st.cache_resource
def load_data():
    """Load the 'sample' configuration of the HUPD/hupd dataset.

    The train split is restricted to filing dates 2016-01-01 .. 2016-01-21
    and the validation split to 2016-01-22 .. 2016-01-31; no ICPR label
    filter is applied (``icpr_label=None``).

    Writes a short status message to the page once loading has finished.

    Returns:
        Whatever ``load_dataset`` returns for this configuration (indexed
        by split name elsewhere in this file).
    """
    # Collected in one mapping so the dataset options can be read at a glance.
    hupd_options = {
        "name": 'sample',
        "data_files": "https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        "icpr_label": None,
        "train_filing_start_date": '2016-01-01',
        "train_filing_end_date": '2016-01-21',
        "val_filing_start_date": '2016-01-22',
        "val_filing_end_date": '2016-01-31',
    }
    splits = load_dataset('HUPD/hupd', **hupd_options)

    st.write('Loading is done!')
    return splits
25
+
26
# Maps the textual decision to the binary label used for training.
_DECISION_TO_SCORE = {'ACCEPTED': 1, 'REJECTED': 0}


def _label_decisions(frame):
    """Keep ACCEPTED/REJECTED rows and add a binary 'patentability_score' column."""
    # .copy() gives an independent frame, so adding the column below does not
    # write into a slice of `frame` (the pattern that triggers pandas'
    # SettingWithCopyWarning).
    labelled = frame[frame['decision'].isin(list(_DECISION_TO_SCORE))].copy()
    labelled['patentability_score'] = labelled['decision'].map(_DECISION_TO_SCORE)
    return labelled


@st.cache_resource
def training_computation(_dataset_dict):
    """Fine-tune DistilBERT on patent abstracts and return the trainer.

    Steps:
      1. Convert the 'train' and 'validation' splits to DataFrames, keep only
         rows whose 'decision' is ACCEPTED or REJECTED, and label them 1 / 0.
      2. Split the filtered training rows 10% / 90% (``test_size=0.90``,
         ``random_state=0``); only the 10% part is used for fitting.
      3. Tokenize the 'abstract' column of each part with the
         'distilbert-base-uncased' tokenizer (truncation and padding on).
      4. Wrap the encodings and labels in ``tf.data.Dataset`` objects.
      5. Train ``TFDistilBertForSequenceClassification`` with ``TFTrainer``
         for 2 epochs, evaluating on the validation dataset.

    Progress messages are written to the page after each stage.

    Args:
        _dataset_dict: Object indexable by 'train' and 'validation', each
            convertible to a DataFrame with 'decision' and 'abstract'
            columns. The leading underscore follows Streamlit's convention
            for arguments that should not be hashed by the cache.

    Returns:
        The ``TFTrainer`` instance after ``train()`` has completed.
    """
    # Heavy ML dependencies are imported locally, as in the original code, so
    # they are only loaded when this function actually runs.
    from sklearn.model_selection import train_test_split
    import tensorflow as tf
    from transformers import (
        DistilBertTokenizerFast,
        TFDistilBertForSequenceClassification,
        TFTrainer,
        TFTrainingArguments,
    )

    df = _label_decisions(pd.DataFrame(_dataset_dict['train']))
    vf = _label_decisions(pd.DataFrame(_dataset_dict['validation']))

    st.write("Processed the data")

    dftrain, dftest = train_test_split(df, test_size=0.90, random_state=0)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def encode(frame):
        # Tokenize every abstract in `frame` in one batch.
        return tokenizer(frame['abstract'].tolist(), truncation=True, padding=True)

    def to_tf_dataset(encodings, frame):
        # Pair the token encodings with the binary labels of the same rows.
        labels = frame['patentability_score'].tolist()
        return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))

    train_encodings = encode(dftrain)
    val_encodings = encode(vf)
    test_encodings = encode(dftest)

    st.write("tokenizing completed!")

    train_dataset = to_tf_dataset(train_encodings, dftrain)
    val_dataset = to_tf_dataset(val_encodings, vf)
    # NOTE(review): test_dataset is built but not used anywhere in this
    # function; either pass it to an evaluation step or drop it to avoid
    # tokenizing 90% of the rows for nothing.
    test_dataset = to_tf_dataset(test_encodings, dftest)

    st.write("back to dataset!")

    training_args = TFTrainingArguments(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        eval_steps=500,
        weight_decay=0.01,
    )

    # The model is created inside the distribution strategy scope supplied by
    # the training arguments.
    with training_args.strategy.scope():
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    st.write("training completed")
    return trainer
107
 
108
+
109
# Load the data and train the model; both calls are cached by Streamlit.
dataset_dict = load_data()
trainer = training_computation(dataset_dict)

# Let the user pick one patent from the training split by its patent number.
patents = pd.DataFrame(dataset_dict['train'])
patent_selection = st.selectbox("Select Patent", patents['patent_number'])

# Show the abstract and the claims of the row(s) matching the selection.
is_selected = patents['patent_number'] == patent_selection
patent = patents.loc[is_selected]
for column in ('abstract', 'claims'):
    st.write(patent[column])