gradio_bert / gradio_bert.py

Rename gradio_sindi.py to gradio_bert.py

5499fd6 verified about 2 years ago

4.5 kB

	# -*- coding: utf-8 -*-
	"""gradio_bert.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly

	# libraries
	"""


	import gradio as gr
	import torch
	from transformers import pipeline
	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from transformers import AutoTokenizer, AutoModelForQuestionAnswering
	import re

	"""# data - text"""

	# Corpus of text passages loaded once at import time; context_func reads
	# its "section_text" column to pick the passage most similar to a question.
	# NOTE(review): '/content/...' looks like a Colab-local path — confirm or
	# adjust it before running this script anywhere else.
	splitted_df = pd.read_csv('/content/splitted_df_jo.csv')

	"""# getting context"""

	def remove_symbols(text: str) -> str:
	    """Strip a fixed set of symbols and every non-ASCII character from *text*.

	    The characters ``/``, ``(``, ``)``, ``.`` and the newline character are
	    deleted outright (nothing is substituted for them, so the words on either
	    side of a removed newline become adjacent). Afterwards every character
	    outside the 7-bit ASCII range is deleted as well.

	    Args:
	        text: The input text to be cleaned.

	    Returns:
	        The cleaned text.

	    Example:
	        >>> remove_symbols("Stage (II) early. Café")
	        'Stage II early Caf'
	    """
	    # Delete the unwanted symbols in a single C-level pass rather than a
	    # per-character Python loop.
	    without_symbols = text.translate(str.maketrans("", "", "/()\n."))

	    # Anything outside \x00-\x7F is non-ASCII and is dropped.
	    return re.sub(r"[^\x00-\x7F]", "", without_symbols)


	# Lazily-built TF-IDF index over the corpus, shared by all queries.
	_TFIDF_INDEX = {}


	def _get_tfidf_index():
	    """Return the (vectorizer, corpus matrix) pair, fitting it on first use."""
	    if "fitted" not in _TFIDF_INDEX:
	        vectorizer = TfidfVectorizer()
	        corpus_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
	        # Stored with a single assignment so a concurrent caller never
	        # observes a half-populated entry.
	        _TFIDF_INDEX["fitted"] = (vectorizer, corpus_tfidf)
	    return _TFIDF_INDEX["fitted"]


	def context_func(message: str) -> str:
	    """Return the corpus passage most similar to *message*.

	    Similarity is the cosine similarity between the TF-IDF vector of the
	    message and the TF-IDF vector of each entry in
	    ``splitted_df["section_text"]``. The winning passage is passed through
	    ``remove_symbols`` before being returned.

	    The vectorizer is fitted on the corpus only once (on the first call) and
	    reused afterwards, instead of being re-fitted for every question.

	    Args:
	        message: The input message or question.

	    Returns:
	        The cleaned text of the most similar passage.

	    Example:
	        >>> context = context_func("What are the symptoms of breast cancer?")
	    """
	    vectorizer, corpus_tfidf = _get_tfidf_index()

	    # Project the question into the vocabulary learned from the corpus.
	    question_tfidf = vectorizer.transform([message])

	    # One similarity score per corpus passage.
	    similarities = cosine_similarity(question_tfidf, corpus_tfidf)[0]

	    # argmax returns the first maximum, so when every score ties (e.g. the
	    # question shares no vocabulary with the corpus) the first passage wins.
	    most_similar_index = int(similarities.argmax())

	    # argmax yields a row *position*, so look it up positionally rather than
	    # by index label.
	    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]

	    return remove_symbols(most_similar_context)

	"""# the model"""

	# NOTE: the notebook cell "!huggingface-cli login" is IPython/Colab shell
	# syntax and a SyntaxError in a plain .py file, so it is not executed here.
	# If the checkpoint requires authentication, run `huggingface-cli login`
	# in a shell before starting this script.
	_MODEL_NAME = "nlp-group/sindi-bert-final"

	# Tokenizer and extractive question-answering model, loaded once at import.
	tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
	model = AutoModelForQuestionAnswering.from_pretrained(_MODEL_NAME)

	def answer_question(question: str) -> tuple[str, str]:
	    """Answer *question* by extracting a span from the best-matching passage.

	    The passage is selected with ``context_func``; the question/passage pair
	    is tokenized (truncated to 512 tokens) and fed to the question-answering
	    model. The answer is the token span running from the highest-scoring
	    start position to the highest-scoring end position, inclusive.

	    Args:
	        question: The input question.

	    Returns:
	        A ``(answer, context)`` tuple: the decoded answer span and the passage
	        it was extracted from. The answer is an empty string when the
	        predicted end position precedes the predicted start position.

	    Example:
	        >>> answer, context = answer_question("What is breast cancer?")
	        >>> print("Answer:", answer)
	        >>> print("Context:", context)
	    """
	    context = context_func(question)

	    # Encode question and passage together as a single model input.
	    inputs = tokenizer(
	        question,
	        context,
	        return_tensors="pt",
	        max_length=512,
	        truncation=True,
	    )

	    # Inference only: skip autograd bookkeeping.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    answer_start = int(torch.argmax(outputs.start_logits))
	    # +1 because the slice end is exclusive while the predicted end is inclusive.
	    answer_end = int(torch.argmax(outputs.end_logits)) + 1

	    answer_ids = inputs["input_ids"][0][answer_start:answer_end]
	    # Skip special tokens so markers such as [CLS]/[SEP] never leak into the
	    # text shown to the user.
	    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)

	    return answer, context

	# Web UI: one text input (the question) and one output box per value
	# returned by answer_question, which returns (answer, context).
	iface = gr.Interface(
	    fn=answer_question,
	    inputs=["text"],
	    outputs=[
	        gr.Textbox(label="Answer"),
	        gr.Textbox(label="Context"),
	    ],
	    title="Women Cancer ChatBot",
	    description="How can I help you?",
	    examples=[
	        ["What is breast cancer?"],
	        ["What are treatments for cervical cancer?"],
	    ],
	)

	iface.launch(debug=True)