Spaces:

24NLPGroupO
/

EmailGeneration

Sleeping

App Files Files Community

EmailGeneration / utils.py

Mia2024

Update utils.py

51ba203 verified almost 2 years ago

raw

history blame contribute delete

3.08 kB

	from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
	import re

	# Load the saved model and its tokenizer, both identified by the same
	# checkpoint name. These statements run at import time, so importing this
	# module performs the load.
	model_checkpoint = "24NLPGroupO/EmailGeneration"
	# NOTE(review): `truncation` is normally a per-call tokenizer argument;
	# confirm it has any effect when passed to from_pretrained.
	tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, truncation=True)
	model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

	# Module-level text-generation pipeline wrapping the model and tokenizer
	# loaded above; generate_email calls it to produce candidate texts.
	generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

	def clean_generated_text(text):
	    """Strip boilerplate from a generated email and fill in placeholders.

	    The steps run in a fixed order:

	    1. A leading ``Re:`` or ``Fwd:`` marker is removed.
	    2. The text is cut off at the first occurrence of each trailing-section
	       marker (signature, ``PHONE``, ``Email:``, ``Cc:``, attachments list,
	       copyright line, ``About Us``); everything from the marker to the end
	       of the text is dropped.
	    3. ``URL`` placeholders are deleted; ``CURRENCYNUMBER`` becomes
	       ``USD 100`` and ``NUMBER`` becomes ``10``.
	    4. Street-address-like spans ("<digits> <word> St/Ave/... <5 digits>")
	       are removed.

	    Args:
	        text: Raw text produced by the generation model.

	    Returns:
	        The cleaned text with leading and trailing whitespace stripped.
	    """
	    # Remove a reply/forward marker at the very start of the text. The pipe
	    # must be unescaped so it acts as alternation, not a literal '|'.
	    text = re.sub(r'^(Re:|Fwd:)', '', text)

	    # Each of these markers starts a trailing section; with DOTALL the
	    # pattern swallows everything from the marker to the end of the text.
	    tail_markers = (
	        r'Best regards,.*$',      # signature
	        r'PHONE.*$',              # phone placeholder and what follows
	        r'Email:.*$',             # email address line and what follows
	        r'Cc:.*$',                # CC list
	        r'\* Attachments:.*',     # attachments list
	        r'©️ .*$',                 # copyright and ownership statements
	    )
	    for pattern in tail_markers:
	        text = re.sub(pattern, '', text, flags=re.DOTALL)

	    text = re.sub(r'URL', '', text)  # Drop URL placeholders

	    # CURRENCYNUMBER contains NUMBER, so it has to be replaced first;
	    # otherwise it would be turned into 'CURRENCY10' and never match.
	    text = re.sub(r'CURRENCYNUMBER', 'USD 100', text)
	    text = re.sub(r'NUMBER', '10', text)

	    # Remove 'About Us' and all following text
	    text = re.sub(r'About Us.*', '', text, flags=re.DOTALL)

	    # Remove address-like spans: "<digits> <word> <suffix>" up to the next
	    # 5-digit group on the same line. Suffixes are applied one at a time,
	    # in this order.
	    street_suffixes = ('St', 'Ave', 'Rd', 'Ln', 'Blvd', 'Dr', 'Ct')
	    for suffix in street_suffixes:
	        address_pattern = r'\d+ [^\s]+ ' + suffix + r'\.?,?.*?\d{5}'
	        text = re.sub(address_pattern, '', text)

	    return text.strip()

	def generate_email(product, gender, profession, hobby):
	    """Generate an email for the given product and recipient attributes.

	    The four arguments are joined with single spaces to form the prompt,
	    the module-level ``generator`` pipeline samples two candidate texts,
	    each candidate is passed through ``clean_generated_text``, and the
	    longest cleaned candidate is returned.

	    Args:
	        product: Product the email is about.
	        gender: Recipient gender, inserted into the prompt as-is.
	        profession: Recipient profession, inserted into the prompt as-is.
	        hobby: Recipient hobby, inserted into the prompt as-is.

	    Returns:
	        The longest cleaned candidate text.
	    """
	    prompt = f"{product} {gender} {profession} {hobby}"

	    sampling_options = {
	        'max_length': 256,          # maximum length of each generated text
	        'top_k': 40,                # number of top candidate words considered
	        'top_p': 0.6,               # probability range for word choices
	        'temperature': 0.4,         # randomness of generation
	        'repetition_penalty': 1.5,  # reduce content repetition
	        'num_return_sequences': 2,  # produce two candidate texts
	        'do_sample': True,
	    }
	    candidates = generator(prompt, **sampling_options)

	    # Clean every candidate and keep the longest one; on a tie the earlier
	    # candidate wins.
	    return max(
	        (clean_generated_text(candidate['generated_text']) for candidate in candidates),
	        key=len,
	    )