Spaces:

pandalow
/

word_cloud_generator

Sleeping

App Files Files Community

word_cloud_generator / app.py

pandalow

fix: font

cb0b221 8 months ago

raw

history blame contribute delete

2.61 kB

	import re
	import os
	import requests

	import jieba
	from PIL import Image
	from wordcloud import WordCloud
	import gradio as gr
	from typing import List

	# Local file name the CJK-capable font is stored under, and where to fetch it from.
	# The downloaded file is Source Han Sans SC, saved under FONT_PATH.
	FONT_PATH = "NotoSansCJK-Regular.otf"
	FONT_URL = "https://github.com/adobe-fonts/source-han-sans/raw/release/OTF/SimplifiedChinese/SourceHanSansSC-Regular.otf"


	# One-time bootstrap: fetch the font at import time if it is not on disk yet.
	if not os.path.exists(FONT_PATH):
	    print("Downloading font...")
	    # Download fully and validate the HTTP status *before* touching FONT_PATH,
	    # so a failed request can never leave an empty/garbage font file behind
	    # that would pass the existence check on the next start.
	    response = requests.get(FONT_URL, timeout=60)
	    response.raise_for_status()
	    partial_path = FONT_PATH + ".part"
	    with open(partial_path, "wb") as f:
	        f.write(response.content)
	    # Atomic rename: FONT_PATH only ever appears once it is complete.
	    os.replace(partial_path, FONT_PATH)

	def tokenize_text(text:str) -> List[str]:
	    """
	    Split mixed Chinese/English text into word tokens.

	    Runs of CJK ideographs (U+4E00-U+9FA5) are segmented with jieba; runs of
	    ASCII letters and digits are kept whole and lower-cased. Any other
	    character (punctuation, whitespace, symbols) acts as a separator and is
	    dropped.

	    Args:
	        text: a string mixed with chinese and english.
	    Returns:
	        a list of tokens, in order of appearance.
	    """
	    # Unescaped `|` is regex alternation: each match is either a CJK run or an
	    # alphanumeric run. (An escaped `\|` would demand a literal pipe character.)
	    runs = re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z0-9]+', text)

	    segmented = []
	    for run in runs:
	        # A run is homogeneous, so checking its first character is enough.
	        if re.match(r'[\u4e00-\u9fa5]', run):
	            segmented.extend(jieba.cut(run))
	        else:
	            segmented.append(run.lower())

	    return segmented


	def generate_world_cloud(text:str) -> Image.Image:
	    """
	    Render a word cloud image for the given text.

	    The text is tokenized with tokenize_text, the tokens are joined with
	    single spaces, and the result is handed to an 800x400 WordCloud with a
	    white background that uses the font at FONT_PATH.

	    Args:
	        text: a string mixed with chinese and english.
	    Returns:
	        Image: PIL.Image.Image
	    """
	    words = " ".join(tokenize_text(text))

	    cloud_options = dict(
	        font_path=FONT_PATH,
	        width=800,
	        height=400,
	        background_color='white',
	    )
	    rendered = WordCloud(**cloud_options).generate(words)
	    return rendered.to_image()

	def generate_word_cloud_from_file(file:str) -> Image.Image:
	    """
	    Generate a word cloud from a text file.

	    The file is read in full and decoded as UTF-8, then passed to
	    generate_world_cloud.

	    Args:
	        file: a file path.
	    Returns:
	        Image: PIL.Image.Image
	    """
	    # The context manager closes the handle even if reading or decoding fails.
	    with open(file, "r", encoding="utf-8") as f:
	        text = f.read()
	    return generate_world_cloud(text)

	# Gradio UI: two tabs (pasted text / uploaded file), each wiring one button
	# to one generator function and showing the resulting PIL image.
	# NOTE(review): widget nesting below is reconstructed (input + button inside
	# the Row, image beneath it) -- confirm against the intended layout.
	with gr.Blocks(title="Word Cloud Generator") as demo:
	    gr.Markdown("## Word Cloud Generator\nSupports mixed Chinese-English input via text or .txt file upload.")
	    with gr.Tab("Text Input"):
	        with gr.Row():
	            text_input = gr.Textbox(lines=10, label="Input texts")
	            text_button = gr.Button("Generate Word Cloud")
	        text_output = gr.Image(type="pil", label="word cloud")
	        text_button.click(generate_world_cloud, inputs=text_input, outputs=text_output)
	    with gr.Tab("File Input"):
	        with gr.Row():
	            file_input = gr.File(label="Upload .txt file")
	            file_button = gr.Button("Generate Word Cloud")
	        file_output = gr.Image(type="pil", label="word cloud")
	        file_button.click(generate_word_cloud_from_file, inputs=file_input, outputs=file_output)


	# Start the web server when the module is executed.
	demo.launch()