Spaces:

mvasani
/

Patentatbility_score_app

Sleeping

mvasani3690

Update app.py

ec0b6af unverified over 2 years ago

4.3 kB

	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch

	import torch
	from pprint import pprint
	from datasets import load_dataset

	# ----- Data Loading ------
	# Load the HUPD 'sample' configuration. The filing-date windows below put
	# applications filed 2016-01-01..2016-01-21 in the training split and those
	# filed 2016-01-22..2016-01-31 in the validation split.
	# NOTE(review): Streamlit re-executes the whole script on each widget
	# interaction, so this loading/tokenization presumably repeats on every
	# rerun -- consider caching it if the installed Streamlit version offers a
	# resource cache.
	dataset_dict = load_dataset(
	    'HUPD/hupd',
	    name='sample',
	    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
	    icpr_label=None,
	    train_filing_start_date='2016-01-01',
	    train_filing_end_date='2016-01-21',
	    val_filing_start_date='2016-01-22',
	    val_filing_end_date='2016-01-31',
	)
	# Here we can see the `train` and `val` splits, along with the
	# location of the cached data files
	print('Dataset contents:')
	print(dataset_dict)

	print('Dataset cache location:')
	print(dataset_dict.cache_files)

	# Data
	train_dataset = dataset_dict["train"]
	val_dataset = dataset_dict["validation"]
	print(f'Train dataset shape: {train_dataset.shape}')
	print(f'Validation dataset shape: {val_dataset.shape}')

	# List all available fields
	print('Dataset fields:')
	print(train_dataset.column_names)

	# Example: preprocess the abstract field of the dataset
	# using HF tokenizers.
	# The RoBERTa tokenizer gets its own name: `tokenizer` is bound to the BERT
	# tokenizer further down, and the mapping helper must not silently switch
	# tokenizers if it is ever called after that assignment.
	abstract_tokenizer = AutoTokenizer.from_pretrained('roberta-base')


	def _tokenize_abstracts(batch):
	    """Tokenize the ``abstract`` column of a batch, truncated and padded to max length."""
	    return abstract_tokenizer(batch['abstract'], truncation=True, padding='max_length')


	# We tokenize in batches, so tokenization is quite fast
	train_dataset = train_dataset.map(
	    _tokenize_abstracts,
	    batched=True,
	    desc="Tokenizing training files",
	)
	val_dataset = val_dataset.map(
	    _tokenize_abstracts,
	    batched=True,
	    desc="Tokenizing validation files",
	)

	# Since we've tokenized the dataset, we have a new cache location
	print('Dataset cache location after tokenization:')
	print(train_dataset.cache_files)

	# And we have added some fields to our dataset
	print('Dataset fields after tokenization:')
	print(train_dataset.column_names)


	# Load the BERT tokenizer and model for sequence classification.
	# NOTE(review): 'bert-base-cased' is a base checkpoint, so the 6-way
	# classification head is presumably freshly initialised here; the scores
	# shown by the app would then not be meaningful until fine-tuned weights
	# are loaded -- confirm.
	tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
	model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)

	# Function to retrieve abstract and claims text based on filing number
	def get_text_data(filing_number):
	    """Return the ``(abstract, claims)`` texts for one row of ``train_dataset``.

	    Args:
	        filing_number: Zero-based row index into ``train_dataset``, or
	            ``None`` when no row is selected.

	    Returns:
	        A 2-tuple ``(abstract, claims)``. Both elements are ``None`` when
	        ``filing_number`` is ``None``, negative, or past the end of the
	        dataset; an individual element is ``None`` when the row has no
	        such field.
	    """
	    # Reject None explicitly: comparing None with an int raises TypeError.
	    if filing_number is None or not 0 <= filing_number < len(train_dataset):
	        return None, None

	    # Access the data corresponding to the filing number
	    data = train_dataset[filing_number]

	    # `.get` already defaults to None for a missing field
	    return data.get('abstract'), data.get('claims')



	# Streamlit app

	# Module-level call: the link is emitted when the script runs, before
	# anything main() renders.
	st.markdown("Link to app - [Patentability app](https://huggingface.co/spaces/mvasani/Patentatbility_score_app)")
	def main():
	    """Render the app page and, on Submit, show the per-class scores.

	    Lets the user pick a row of ``train_dataset``, displays that row's
	    abstract and claims, and when the Submit button is pressed runs the
	    module-level ``tokenizer`` and ``model`` on the (abstract, claims) pair
	    and writes the softmax score of each of the six classes.
	    """
	    st.title("Patentability Score App")

	    # Dropdown menu to select the application filing number
	    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))

	    # Display abstract and claims text boxes based on selected filing number.
	    # Missing text is shown as an empty box rather than passing None through.
	    abstract, claims = get_text_data(filing_number)
	    st.subheader("Abstract:")
	    st.text_area("Abstract Text", abstract or "", height=200, key='abstract_text')
	    st.subheader("Claims:")
	    st.text_area("Claims Text", claims or "", height=400, key='claims_text')

	    # Submit button to calculate and display the patentability score
	    if st.button("Submit"):
	        # The tokenizer cannot encode None, so stop with a message instead
	        # of failing with an exception.
	        if abstract is None or claims is None:
	            st.warning("No abstract/claims text is available for the selected application.")
	            return

	        # Tokenize the abstract and claims texts as one sequence pair
	        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)

	        # Perform inference with the model to get the logits
	        with torch.no_grad():
	            logits = model(**inputs).logits

	        # Softmax over the class dimension; [0] selects the single example
	        score = torch.softmax(logits, dim=1).tolist()[0]

	        # Display the patentability score, one line per class, in logit order
	        st.subheader("Patentability Score:")
	        labels = (
	            "REJECTED",
	            "ACCEPTED",
	            "PENDING",
	            "CONT-REJECTED",
	            "CONT-ACCEPTED",
	            "CONT-PENDING",
	        )
	        for label, probability in zip(labels, score):
	            st.write(f"{label}:", probability)

	# Script entry point: build the page only when this file is run directly.
	if __name__ == "__main__":
	    main()