Spaces:
Sleeping
Sleeping
mvasani3690
committed on
Add files via upload
Browse files
app.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from pprint import pprint
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
|
| 9 |
+
# ----- Data Loading ------
|
| 10 |
+
# Fetch the 'sample' configuration of HUPD. Going by the argument names, the
# four date arguments bound the filing-date window of each split:
# 2016-01-01..2016-01-21 for training and 2016-01-22..2016-01-31 for validation.
dataset_dict = load_dataset(
    "HUPD/hupd",
    name="sample",
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date="2016-01-01",
    train_filing_end_date="2016-01-21",
    val_filing_start_date="2016-01-22",
    val_filing_end_date="2016-01-31",
)

# Report which splits were built and where their files are cached on disk.
print("Dataset contents:", dataset_dict, sep="\n")
print("Dataset cache location:", dataset_dict.cache_files, sep="\n")

# Keep a module-level handle on each split; the rest of the script uses these.
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print("Train dataset shape:", train_dataset.shape)
print("Validation dataset shape:", val_dataset.shape)

# Enumerate the columns carried by every record.
print("Dataset fields:", train_dataset.column_names, sep="\n")
|
| 36 |
+
|
| 37 |
+
# Example: preprocess the abstract field of the dataset
|
| 38 |
+
# using HF tokenizers
|
| 39 |
+
from transformers import AutoTokenizer
|
| 40 |
+
# RoBERTa tokenizer used for the batch preprocessing below.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')


def _tokenize_abstracts(batch):
    """Tokenize the ``abstract`` column of one batch of records.

    Sequences are truncated and padded with ``padding='max_length'`` so every
    row in the batch is encoded with the same settings.

    Args:
        batch: A batch of records as passed by ``Dataset.map(batched=True)``;
            only its ``'abstract'`` entry is read.

    Returns:
        The tokenizer output for the batch, whose fields ``map`` adds to the
        dataset as new columns.
    """
    return tokenizer(batch['abstract'], truncation=True, padding='max_length')


# We tokenize in batches, so tokenization is quite fast
train_dataset = train_dataset.map(
    _tokenize_abstracts,
    batched=True,
    desc="Tokenizing training files",
)
# The progress label previously said "training" for this split as well.
val_dataset = val_dataset.map(
    _tokenize_abstracts,
    batched=True,
    desc="Tokenizing validation files",
)

# Since we've tokenized the dataset, we have a new cache location
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)

# And we have added some fields to our dataset
print('Dataset fields after tokenization:')
print(train_dataset.column_names)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Load the BERT tokenizer and model for sequence classification
|
| 64 |
+
# Rebind `tokenizer` to the cased BERT tokenizer and build a BERT sequence
# classifier with six output labels; both come from 'bert-base-cased'.
# NOTE(review): 'bert-base-cased' looks like a base checkpoint, so the
# classification head is presumably not fine-tuned here — confirm before
# relying on the scores this model produces.
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-cased',
)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=6,
)
|
| 66 |
+
|
| 67 |
+
# Function to retrieve abstract and claims text based on filing number
|
| 68 |
+
def get_text_data(filing_number):
    """Return the abstract and claims text of one training record.

    Despite its name, ``filing_number`` is used as a zero-based row index
    into ``train_dataset``, not as a patent application number.

    Args:
        filing_number: Row index into ``train_dataset``. ``None`` is accepted
            and treated as "nothing selected".

    Returns:
        An ``(abstract, claims)`` tuple. Both items are ``None`` when
        ``filing_number`` is ``None``, negative, or past the end of the
        dataset; an individual item is ``None`` when the record has no such
        field.
    """
    # A missing selection previously reached the length comparison and raised
    # TypeError; treat it like any other invalid index.
    if filing_number is None or filing_number < 0:
        return None, None
    if filing_number >= len(train_dataset):
        return None, None

    record = train_dataset[filing_number]
    return record.get('abstract'), record.get('claims')
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# Streamlit app
|
| 85 |
+
def main():
    """Render the Streamlit page and score the selected record on request.

    The page shows a selector over the rows of ``train_dataset``, the
    abstract and claims of the chosen row, and a Submit button. When Submit
    is pressed, the abstract/claims pair is encoded with the module-level
    ``tokenizer``, run through the module-level ``model`` without gradient
    tracking, and the softmax of the logits is written out per label.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title("Patentability Score App")

    # The options are row indices of train_dataset, not real filing numbers.
    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))

    # Show the text belonging to the selected row.
    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')

    if not st.button("Submit"):
        return

    # get_text_data yields None for an invalid selection or a missing field.
    # The tokenizer cannot encode None, so report it instead of crashing.
    if abstract is None or claims is None:
        st.warning("No abstract/claims text is available for the selected application.")
        return

    # Encode abstract and claims together as a single sequence pair.
    inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)

    # Inference only, so skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(**inputs).logits

    # One input was encoded, so take the first (only) row of probabilities.
    probabilities = torch.softmax(logits, dim=1).tolist()[0]

    # Label order follows the output indices 0..5 used by the display code.
    # NOTE(review): confirm this order matches the label ids of the model.
    labels = (
        "REJECTED",
        "ACCEPTED",
        "PENDING",
        "CONT-REJECTED",
        "CONT-ACCEPTED",
        "CONT-PENDING",
    )
    st.subheader("Patentability Score:")
    for label, probability in zip(labels, probabilities):
        st.write(f"{label}:", probability)


if __name__ == "__main__":
    main()
|