Spaces:

Anvilogic
/

T5-Typosquat-Detect

Sleeping

App Files Files Community

T5-Typosquat-Detect / app.py

chgrdj

Updating text and styling (#1)

dd3d098 verified about 1 year ago

raw

history blame contribute delete

2.26 kB

	import streamlit as st
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	from peft import PeftModel
	import torch

	@st.cache_resource
	def load_model():
	    """Build the typosquat-detection model together with its tokenizer.

	    The tokenizer and the seq2seq base model are both loaded from the
	    ``google/flan-t5-large`` checkpoint. The PEFT adapter found at
	    ``./Flan-T5-Typosquat-detect`` is then applied to the base model, and the
	    object returned by ``merge_and_unload()`` is switched to eval mode.

	    Returns:
	        A ``(model, tokenizer)`` tuple.
	    """
	    base_checkpoint = "google/flan-t5-large"
	    # Adjust to wherever the fine-tuned adapter was saved.
	    adapter_dir = "./Flan-T5-Typosquat-detect"

	    # Tokenizer first, then the base model (same order as before).
	    tok = AutoTokenizer.from_pretrained(base_checkpoint)
	    base_model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint)

	    # Apply the adapter and keep only what merge_and_unload() hands back.
	    merged_model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()
	    merged_model.eval()

	    return merged_model, tok
	# Device the tokenized prompt is moved to before generation.
	device = "cpu"
	model, tokenizer = load_model()

	# Page header: title, an introduction rendered as markdown, and a short
	# usage hint.
	st.title("Fine tuned FLAN-T5 Typosquatting Detection")
	intro_markdown = (
	    "This streamlit demonstrates our fine tuned model for typosquatting detection. We found that using "
	    "SLMs or LLMs and prompt engineering for this task could not achieve the same accuracy as our [cross encoder](https://huggingface.co/Anvilogic/CE-typosquat-detect). "
	    "We found that by fine tuning a FLAN-T5 model, we could get the same accuracy as our cross encoder model. "
	    "Using an SLM like Flan allows you to output the response (here `true` or `false`) directly into another LM. "
	)
	st.markdown(intro_markdown)
	st.write("Enter a potential typosquatted domain and a target domain to check if one is a variant of the other.")

	# Instruction that precedes the two domains in the prompt sent to the model.
	prompt_prefix = "Is the first domain a typosquat of the second:"

	# Strip surrounding whitespace from both fields: a pasted value such as
	# " tiktok.com " would otherwise change the prompt text, and a field holding
	# only blanks would be treated as filled in by the emptiness check that
	# guards the prediction.
	potential_typosquat = st.text_input("Potential Typosquatted Domain", value="tiktok-tikto-tibyd-yjdj.com").strip()
	target_domain = st.text_input("Legitimate Domain", value="tiktok.com").strip()

	# Prompt layout: "<prefix> <candidate domain> <legitimate domain>".
	full_prompt = f"{prompt_prefix} {potential_typosquat} {target_domain}"

	# Run the model only when the button is pressed on this script run.
	if st.button("Check Typosquatting"):
	    if not (potential_typosquat and target_domain):
	        # At least one of the two fields is empty: ask for both.
	        st.warning("Please enter both domains to perform the check.")
	    else:
	        # Tokenize the prompt and move the ids to the configured device.
	        encoded_prompt = tokenizer(full_prompt, return_tensors="pt")
	        prompt_ids = encoded_prompt.input_ids.to(device)

	        # Generate at most 20 new tokens and decode the first sequence,
	        # dropping special tokens.
	        generated = model.generate(prompt_ids, max_new_tokens=20)
	        answer = tokenizer.decode(generated[0], skip_special_tokens=True)

	        # Show the question together with the model's answer.
	        st.markdown(f"Is {potential_typosquat} a typosquat of {target_domain}? {answer}")