Upload 2 files

eb404b7 verified almost 2 years ago

5.54 kB

	import csv
	import torch
	from transformers import pipeline

	# Build the text-generation pipeline around Mistral-7B-Instruct v0.3,
	# loading the weights in half precision (float16).
	chatbot = pipeline(
	    task="text-generation",
	    model="mistralai/Mistral-7B-Instruct-v0.3",
	    device=0,  # device index 0: assumes a GPU is available
	    torch_dtype=torch.float16,
	)

	# Sentiment names. A sentiment's position in this list (0, 1, 2) is the
	# numeric label written next to each generated sentence.
	sentiments = [
	    "Positive",
	    "Neutral",
	    "Negative",
	]

	# Content formats, visited round-robin (one per generated prompt).
	# Order matters: it determines which format is paired with which prompt.
	formats = [
	    "Feature Stories",
	    "Instructional Manuals",
	    "FAQs",
	    "Policy Documents",
	    "Live Stream Descriptions",
	    "Editorial Content",
	    "Research Papers",
	    "User Manuals",
	    "Commentaries",
	    "Opinion Pieces",
	    "Newsletters",
	    "Online Courses",
	    "Photo Essays",
	    "Annual Reports",
	    "User-Generated Content",
	    "Testimonials",
	    "DIY Content",
	    "How-To Videos",
	    "Campaign Reports",
	    "Legal Briefs",
	    "Blog Posts",
	    "Case Studies",
	    "Tutorials",
	    "Interviews",
	    "Press Releases",
	    "eBooks",
	    "Infographics",
	    "Webinars",
	    "Podcast Descriptions",
	    "Video Scripts",
	    "Advertisements",
	    "Forum Discussions",
	    "Whitepapers",
	    "Surveys",
	    "Product Reviews",
	    "Event Summaries",
	    "Opinion Editorials",
	    "Letters to the Editor",
	    "Round-Up Posts",
	    "Buying Guides",
	    "Checklists",
	    "Cheat Sheets",
	    "Recipes",
	    "Travel Guides",
	    "Profiles",
	    "Lists",
	    "Q&A Sessions",
	    "Debates",
	    "Polls",
	]

	# Topics, visited round-robin (one per generated prompt).
	# Order matters: it determines which topic is paired with which prompt.
	topics = [
	    "Family",
	    "Travel",
	    "Politics",
	    "Science",
	    "Health",
	    "Technology",
	    "Sports",
	    "Education",
	    "Environment",
	    "Economics",
	    "Culture",
	    "History",
	    "Music",
	    "Literature",
	    "Food",
	    "Art",
	    "Fashion",
	    "Entertainment",
	    "Business",
	    "Relationships",
	    "Fitness",
	    "Automotive",
	    "Finance",
	    "Real Estate",
	    "Law",
	    "Psychology",
	    "Philosophy",
	    "Religion",
	    "Gardening",
	    "DIY",
	    "Hobbies",
	    "Pets",
	    "Career",
	    "Marketing",
	    "Customer Service",
	    "Networking",
	    "Innovation",
	    "Artificial Intelligence",
	    "Sustainability",
	    "Social Issues",
	    "Digital Media",
	    "Programming",
	    "Cybersecurity",
	    "Astronomy",
	    "Geography",
	    "Travel Tips",
	    "Cooking",
	    "Parenting",
	    "Productivity",
	    "Mindfulness",
	    "Mental Health",
	    "Self-Improvement",
	    "Leadership",
	    "Teamwork",
	    "Volunteering",
	    "Nonprofits",
	    "Gaming",
	    "E-commerce",
	    "Photography",
	    "Videography",
	    "Film",
	    "Television",
	    "Streaming Services",
	    "Podcasts",
	    "Public Speaking",
	    "Event Planning",
	    "Interior Design",
	    "Architecture",
	    "Urban Development",
	    "Agriculture",
	    "Climate Change",
	    "Renewable Energy",
	    "Space Exploration",
	    "Biotechnology",
	    "Cryptocurrency",
	    "Blockchain",
	    "Robotics",
	    "Automated Systems",
	    "Genetics",
	    "Medicine",
	    "Pharmacy",
	    "Veterinary Science",
	    "Marine Biology",
	    "Ecology",
	    "Conservation",
	    "Wildlife",
	    "Botany",
	    "Zoology",
	    "Geology",
	    "Meteorology",
	    "Aviation",
	    "Maritime",
	    "Logistics",
	    "Supply Chain",
	    "Human Resources",
	    "Diversity and Inclusion",
	    "Ethics",
	    "Corporate Governance",
	    "Public Relations",
	    "Journalism",
	    "Advertising",
	    "Sales",
	    "Customer Experience",
	    "Retail",
	    "Hospitality",
	    "Tourism",
	    "Luxury Goods",
	    "Consumer Electronics",
	    "Fashion Design",
	    "Textiles",
	    "Jewelry",
	    "Cosmetics",
	    "Skincare",
	    "Perfume",
	    "Toys",
	    "Gadgets",
	    "Home Appliances",
	    "Furniture",
	    "Home Improvement",
	    "Landscaping",
	    "Real Estate Investment",
	]

	# Create (or truncate) the UTF-8 output CSV and write its header row,
	# using minimal quoting.
	csv_file = "sentences.csv"
	with open(csv_file, "w", encoding="utf-8", newline="") as file:
	    writer = csv.writer(
	        file,
	        quoting=csv.QUOTE_MINIMAL,
	        quotechar='"',
	        delimiter=",",
	    )
	    writer.writerow(["text", "label"])

	# Function to ensure correct quoting
	def ensure_correct_quoting(text):
	    """Return *text* wrapped in double quotes unless it already is.

	    A string counts as already wrapped only when it is at least two
	    characters long and both starts and ends with ``"``; a lone ``"`` is a
	    single character that happens to satisfy both checks and is therefore
	    wrapped like any other text.

	    Quotes inside the text are left untouched (not escaped).

	    NOTE: when the result is written through ``csv.writer``, the writer
	    applies its own quoting on top and doubles these quote characters, so
	    the added quotes end up as literal characters in the parsed field.

	    Args:
	        text: The string to wrap.

	    Returns:
	        ``text`` unchanged if already wrapped, otherwise ``'"' + text + '"'``.
	    """
	    already_wrapped = (
	        len(text) >= 2 and text.startswith('"') and text.endswith('"')
	    )
	    return text if already_wrapped else f'"{text}"'

	# Collect and save responses until TARGET_ROWS rows have been written.
	#
	# Every attempt takes the next sentiment, content format and topic in
	# lockstep. With the lists defined in this file (3 sentiments, 49 formats,
	# 121 topics; pairwise coprime lengths) the same prompt comes around again
	# after 3 * 49 * 121 = 17,787 attempts, so a 100,000-row run revisits each
	# prompt several times.
	TARGET_ROWS = 100000
	row_count = 0
	format_index = 0
	topic_index = 0

	# Open the output once in append mode (the header row has already been
	# written) instead of reopening it for every row. Flushing after each row
	# hands the data to the OS immediately, as closing the file per row did.
	with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
	    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

	    while row_count < TARGET_ROWS:
	        for idx, sentiment in enumerate(sentiments):
	            format_type = formats[format_index % len(formats)]
	            format_index += 1
	            topic = topics[topic_index % len(topics)]
	            topic_index += 1

	            # Add the current sentiment prompt with the format and topic
	            # NOTE(review): the prompt is sent as plain text rather than as
	            # chat messages; confirm that is intended for this instruct model.
	            prompt = f"Write a single sentence of web content in Croatian. Content type: {format_type}. Topic: {topic}. Sentiment: {sentiment}."

	            response = chatbot(prompt, max_new_tokens=100)

	            # Debug print to check response format
	            print(f"Full model response: {response}")

	            # Extract the generated text from the response structure
	            generated_text = response[0]['generated_text']

	            # Remove the prompt if it appears in the output, then keep only
	            # the first line of what is left.
	            clean_text = generated_text.replace(prompt, "").strip().split('\n')[0].strip()

	            # Do NOT add quotes by hand: csv.writer quotes fields itself and
	            # doubles any embedded quote, so manually wrapped text would be
	            # stored as """...""" and read back with literal quote characters.
	            # For the same reason, peel off one pair of quotes the model put
	            # around the whole sentence (only when there are no other quotes
	            # inside, so genuinely quoted phrases are left alone).
	            if (
	                len(clean_text) >= 2
	                and clean_text.startswith('"')
	                and clean_text.endswith('"')
	                and '"' not in clean_text[1:-1]
	            ):
	                clean_text = clean_text[1:-1].strip()

	            # An empty generation is not a usable sample: skip it without
	            # counting it towards the target.
	            if not clean_text:
	                print(f"Empty response for sentiment '{sentiment}' skipped.")
	                continue

	            # Append the clean response text and its numeric label to the CSV
	            writer.writerow([clean_text, idx])
	            file.flush()

	            row_count += 1
	            print(f"Response for sentiment '{sentiment}' saved to {csv_file}. Total rows: {row_count}")

	            # The target is not a multiple of len(sentiments), so stop
	            # mid-pass once it is reached.
	            if row_count >= TARGET_ROWS:
	                break

	print("All responses saved. Total rows:", row_count)