Spaces:
Sleeping
Sleeping
| # サンプルメールデータの生成スクリプト | |
| import os | |
| import json | |
| import random | |
| import pandas as pd | |
| import numpy as np | |
# Output directory for the generated dataset: <parent of this script's directory>/data.
# abspath() guards against a relative __file__ (e.g. running `python script.py`
# from the script's own directory on interpreters where __main__.__file__ is not
# absolute); without it dirname(dirname(...)) collapses to "" and the data
# directory would be created under the current working directory instead.
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "data",
)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Path of the JSON file that receives the templates and placeholder tables
TEMPLATES_PATH = os.path.join(OUTPUT_DIR, "templates.json")
# E-mail templates, keyed by category ("legitimate" / "spam") and then by
# language ("ja" / "en"). Each template is a single string containing
# {PLACEHOLDER} tokens; it is split across adjacent literals at paragraph
# boundaries purely for readability.
email_templates = {
    "legitimate": {
        "ja": [
            (
                "【研究】{RESEARCH_TOPIC}に関する問い合わせ\n\n"
                "{GREETING}、\n\n"
                "私は{UNIVERSITY}の{SENDER_NAME}と申します。"
                "現在、{RESEARCH_TOPIC}について研究しており、"
                "貴{ORGANIZATION}の{SPECIFIC_INTEREST}に関心があります。\n\n"
                "{QUESTION}\n\n"
                "{CLOSING}\n"
                "{SENDER_NAME}"
            ),
            (
                "{GREETING}\n\n"
                "{RESEARCH_TOPIC}に関する情報をお願いしたいと思います。"
                "{SPECIFIC_INTEREST}について詳細を知りたいのですが、資料などございますでしょうか。\n\n"
                "{CLOSING}\n"
                "{SENDER_NAME}({UNIVERSITY})"
            ),
            (
                "件名:{SPECIFIC_INTEREST}について\n\n"
                "{GREETING}\n"
                "{ORGANIZATION}ご担当者様\n\n"
                "{UNIVERSITY}の{SENDER_NAME}です。{RESEARCH_TOPIC}の研究プロジェクトを進めています。\n"
                "{QUESTION}\n\n"
                "{CLOSING}"
            ),
        ],
        "en": [
            (
                "Research Inquiry: {RESEARCH_TOPIC}\n\n"
                "{GREETING},\n\n"
                "My name is {SENDER_NAME} from {UNIVERSITY}. "
                "I am currently researching {RESEARCH_TOPIC} and I'm interested in your {SPECIFIC_INTEREST}.\n\n"
                "{QUESTION}\n\n"
                "{CLOSING},\n"
                "{SENDER_NAME}"
            ),
            (
                "Information Request: {SPECIFIC_INTEREST}\n\n"
                "{GREETING},\n\n"
                "I am writing to request information about {SPECIFIC_INTEREST} "
                "for my research on {RESEARCH_TOPIC}.\n\n"
                "{QUESTION}\n\n"
                "{CLOSING},\n"
                "{SENDER_NAME}\n"
                "{UNIVERSITY}"
            ),
            (
                "Subject: Inquiry about {RESEARCH_TOPIC}\n\n"
                "{GREETING} {ORGANIZATION},\n\n"
                "I am {SENDER_NAME}, a researcher at {UNIVERSITY}. "
                "I would like to inquire about {SPECIFIC_INTEREST} as part of my study on {RESEARCH_TOPIC}.\n\n"
                "{QUESTION}\n\n"
                "{CLOSING}"
            ),
        ],
    },
    "spam": {
        # NOTE(review): the Japanese {GREETING} options are greeting phrases, not
        # names, so "{GREETING}様" renders as e.g. "お疲れ様です様" — confirm
        # whether this is intended before changing the data.
        "ja": [
            (
                "【緊急】{PROMOTION}のお知らせ\n\n"
                "{GREETING}様\n\n"
                "{COMPANY}の{SENDER_NAME}です。"
                "この度、弊社では{PRODUCT}の{PROMOTION}を実施しております。"
                "{SALES_PITCH}\n\n"
                "{CALL_TO_ACTION}\n\n"
                "{URGENCY}\n\n"
                "{COMPANY}\n"
                "{SENDER_NAME}"
            ),
            (
                "【最終】{PROMOTION}は本日まで!\n\n"
                "{GREETING}様\n\n"
                "{SALES_PITCH}今なら{PRODUCT}が{PROMOTION}になっています。"
                "この機会をお見逃しなく!\n\n"
                "{CALL_TO_ACTION}\n\n"
                "{URGENCY}\n\n"
                "{COMPANY}営業部"
            ),
            (
                "{PRODUCT}の導入事例のご紹介\n\n"
                "{GREETING}様\n\n"
                "{COMPANY}の{SENDER_NAME}です。"
                "弊社の{PRODUCT}を導入いただいた{REFERENCE_CUSTOMER}様の成功事例をご紹介します。"
                "{SALES_PITCH}\n\n"
                "{CALL_TO_ACTION}\n\n"
                "{SENDER_NAME}"
            ),
        ],
        "en": [
            (
                "URGENT: {PROMOTION} - Limited Time Offer\n\n"
                "{GREETING},\n\n"
                "I'm {SENDER_NAME} from {COMPANY}. "
                "We are currently offering a special {PROMOTION} on our {PRODUCT}. "
                "{SALES_PITCH}\n\n"
                "{CALL_TO_ACTION}\n\n"
                "{URGENCY}\n\n"
                "Best regards,\n"
                "{SENDER_NAME}\n"
                "{COMPANY}"
            ),
            (
                "LAST CHANCE: {PROMOTION} Ends Today!\n\n"
                "{GREETING},\n\n"
                "{SALES_PITCH} Get our {PRODUCT} with {PROMOTION} now. Don't miss out!\n\n"
                "{CALL_TO_ACTION}\n\n"
                "{URGENCY}\n\n"
                "{COMPANY} Sales Team"
            ),
            (
                "Success Story: {REFERENCE_CUSTOMER} with our {PRODUCT}\n\n"
                "{GREETING},\n\n"
                "I'm {SENDER_NAME} from {COMPANY}. "
                "I wanted to share a success story from {REFERENCE_CUSTOMER} who implemented our {PRODUCT}. "
                "{SALES_PITCH}\n\n"
                "{CALL_TO_ACTION}\n\n"
                "Regards,\n"
                "{SENDER_NAME}"
            ),
        ],
    },
}
# Candidate replacement phrases for each placeholder token (Japanese).
# Key order is significant: it determines the substitution order and the
# key order of the saved JSON.
placeholders_ja = {
    "{GREETING}": [
        "お世話になっております",
        "いつもお世話になっております",
        "ご無沙汰しております",
        "初めてメールいたします",
        "お疲れ様です",
    ],
    "{SENDER_NAME}": [
        "田中",
        "佐藤",
        "鈴木",
        "高橋",
        "渡辺",
        "伊藤",
        "山本",
        "中村",
        "小林",
        "加藤",
    ],
    "{UNIVERSITY}": [
        "東京大学",
        "京都大学",
        "大阪大学",
        "東北大学",
        "名古屋大学",
        "九州大学",
        "北海道大学",
        "筑波大学",
        "広島大学",
        "神戸大学",
    ],
    "{ORGANIZATION}": [
        "研究所",
        "研究室",
        "部署",
        "センター",
        "学部",
        "チーム",
        "グループ",
        "プロジェクト",
    ],
    "{RESEARCH_TOPIC}": [
        "人工知能",
        "機械学習",
        "自然言語処理",
        "ビッグデータ分析",
        "IoT",
        "サイバーセキュリティ",
        "量子コンピューティング",
        "バイオインフォマティクス",
        "ロボティクス",
        "持続可能エネルギー",
    ],
    "{SPECIFIC_INTEREST}": [
        "研究プロジェクト",
        "最新の論文",
        "共同研究の可能性",
        "研究施設の見学",
        "研究データの共有",
        "研究会への参加",
        "研究助成金",
        "研究インターンシップ",
    ],
    "{QUESTION}": [
        "詳細情報を提供していただけますか?",
        "面会の可能性はありますでしょうか?",
        "共同研究の可能性についてお話しできますか?",
        "参考文献をご紹介いただけますか?",
        "研究データを共有していただくことは可能でしょうか?",
    ],
    "{CLOSING}": [
        "よろしくお願いいたします。",
        "ご検討のほど、よろしくお願いいたします。",
        "ご回答をお待ちしております。",
        "お手数ですが、ご連絡いただければ幸いです。",
        "今後ともよろしくお願いいたします。",
    ],
    "{COMPANY}": [
        "株式会社テクノソリューション",
        "グローバルITサービス株式会社",
        "デジタルイノベーション株式会社",
        "未来システム株式会社",
        "スマートビジネス株式会社",
    ],
    "{PRODUCT}": [
        "クラウドサービス",
        "AI分析ツール",
        "業務効率化システム",
        "セキュリティソフト",
        "データ管理プラットフォーム",
        "オンライン研修プログラム",
    ],
    "{PROMOTION}": [
        "期間限定キャンペーン",
        "特別割引",
        "無料トライアル",
        "早期割引",
        "パッケージ割引",
        "限定特典",
    ],
    "{SALES_PITCH}": [
        "御社の業務効率が飛躍的に向上します。",
        "コスト削減と生産性向上を実現します。",
        "競合他社との差別化に貢献します。",
        "導入企業様の満足度は98%です。",
        "業界トップクラスの機能を提供します。",
    ],
    "{CALL_TO_ACTION}": [
        "詳細資料をご希望の方は返信にてお知らせください。",
        "下記のリンクからお申し込みいただけます。",
        "デモのご予約はこちらから。",
        "担当者が直接ご説明いたしますので、ご連絡ください。",
        "今すぐお問い合わせください。",
    ],
    "{URGENCY}": [
        "このオファーは今週末までとなります。",
        "先着30社様限定となります。",
        "期間限定のため、お早めにお申し込みください。",
        "在庫限りとなりますので、お急ぎください。",
        "年度末の特別キャンペーンです。",
    ],
    "{REFERENCE_CUSTOMER}": [
        "A社",
        "B株式会社",
        "C大学",
        "D研究所",
        "E病院",
        "F銀行",
        "G自動車",
    ],
}
# Candidate replacement phrases for each placeholder token (English).
# Key order is significant: it determines the substitution order and the
# key order of the saved JSON.
placeholders_en = {
    "{GREETING}": [
        "Dear",
        "Hello",
        "Hi",
        "Greetings",
        "Good day",
    ],
    "{SENDER_NAME}": [
        "John Smith",
        "Emily Johnson",
        "Michael Brown",
        "Sarah Davis",
        "Robert Wilson",
        "Jennifer Lee",
        "David Martinez",
        "Lisa Anderson",
        "James Taylor",
        "Susan White",
    ],
    "{UNIVERSITY}": [
        "Stanford University",
        "MIT",
        "Harvard University",
        "University of Cambridge",
        "Oxford University",
        "University of Tokyo",
        "University of California",
        "ETH Zurich",
        "National University of Singapore",
        "University of Toronto",
    ],
    "{ORGANIZATION}": [
        "Research Institute",
        "Laboratory",
        "Department",
        "Center",
        "Faculty",
        "Team",
        "Group",
        "Project",
    ],
    "{RESEARCH_TOPIC}": [
        "Artificial Intelligence",
        "Machine Learning",
        "Natural Language Processing",
        "Big Data Analytics",
        "IoT",
        "Cybersecurity",
        "Quantum Computing",
        "Bioinformatics",
        "Robotics",
        "Sustainable Energy",
    ],
    "{SPECIFIC_INTEREST}": [
        "research projects",
        "recent publications",
        "collaboration opportunities",
        "research facilities",
        "data sharing",
        "research conferences",
        "research grants",
        "research internships",
    ],
    "{QUESTION}": [
        "Could you provide more information?",
        "Would it be possible to schedule a meeting?",
        "Can we discuss potential collaborations?",
        "Would you recommend any relevant references?",
        "Is it possible to share research data?",
    ],
    "{CLOSING}": [
        "Thank you for your consideration",
        "I look forward to your response",
        "I appreciate your assistance",
        "Thank you for your time",
        "Best regards",
    ],
    "{COMPANY}": [
        "TechnoSolutions Inc.",
        "Global IT Services Ltd.",
        "Digital Innovation Corp.",
        "Future Systems Co.",
        "Smart Business Technologies",
    ],
    "{PRODUCT}": [
        "Cloud Service",
        "AI Analysis Tool",
        "Business Efficiency System",
        "Security Software",
        "Data Management Platform",
        "Online Training Program",
    ],
    "{PROMOTION}": [
        "limited-time offer",
        "special discount",
        "free trial",
        "early bird discount",
        "package deal",
        "exclusive bonus",
    ],
    "{SALES_PITCH}": [
        "This will dramatically improve your business efficiency.",
        "Our solution reduces costs and increases productivity.",
        "Stay ahead of your competition with our solution.",
        "We have a 98% customer satisfaction rate.",
        "We offer industry-leading features.",
    ],
    "{CALL_TO_ACTION}": [
        "Reply for more information.",
        "Click the link below to sign up.",
        "Book a demo now.",
        "Contact us for a personal consultation.",
        "Inquire now.",
    ],
    "{URGENCY}": [
        "This offer ends this weekend.",
        "Limited to the first 30 companies.",
        "Act now before the promotion ends.",
        "While supplies last.",
        "Special end-of-year campaign.",
    ],
    "{REFERENCE_CUSTOMER}": [
        "Company A",
        "Corporation B",
        "University C",
        "Research Institute D",
        "Hospital E",
        "Bank F",
        "Automotive G",
    ],
}
# Save the templates and both placeholder tables as one JSON document.
# ensure_ascii=False keeps the Japanese text readable in the file instead of
# emitting \uXXXX escapes.
templates_data = {
    "templates": email_templates,
    "placeholders": dict(ja=placeholders_ja, en=placeholders_en),
}

with open(TEMPLATES_PATH, "w", encoding="utf-8") as f:
    f.write(json.dumps(templates_data, ensure_ascii=False, indent=2))

print(f"テンプレートを {TEMPLATES_PATH} に保存しました。")
# Randomly substitute the placeholder tokens in a template
def fill_template(template, placeholders):
    """Return *template* with every known placeholder token substituted.

    Args:
        template: Text containing zero or more placeholder tokens.
        placeholders: Mapping of token (e.g. "{GREETING}") to a sequence of
            candidate replacement strings.

    Returns:
        The filled-in text. All occurrences of one token receive the same
        randomly chosen candidate. Tokens absent from the text are skipped
        without drawing from the random generator.
    """
    result = template
    for token, candidates in placeholders.items():
        # Checked against the text as filled so far, not the pristine template.
        if token not in result:
            continue
        result = result.replace(token, random.choice(candidates))
    return result
# Generate the labelled sample e-mails
def generate_sample_emails(num_legitimate=50, num_spam=50):
    """Build a shuffled DataFrame of synthetic legitimate and spam e-mails.

    Each category is split between Japanese and English. Japanese receives
    ``count // 2`` e-mails and English receives the remainder, so an odd
    count still yields exactly the requested number of rows (previously both
    languages used ``count // 2`` and one e-mail was silently dropped).

    Args:
        num_legitimate: Number of legitimate-inquiry e-mails (label 0).
        num_spam: Number of sales/spam e-mails (label 1).

    Returns:
        A pandas DataFrame with columns ``email_text``, ``label`` (0 for a
        legitimate inquiry, 1 for sales/spam) and ``language`` ("ja" or
        "en"), with rows in random order and a fresh 0..n-1 index.
    """
    ja_legitimate = num_legitimate // 2
    ja_spam = num_spam // 2

    # (category, label, language, placeholder table, count), in generation order.
    plan = [
        ("legitimate", 0, "ja", placeholders_ja, ja_legitimate),
        ("legitimate", 0, "en", placeholders_en, num_legitimate - ja_legitimate),
        ("spam", 1, "ja", placeholders_ja, ja_spam),
        ("spam", 1, "en", placeholders_en, num_spam - ja_spam),
    ]

    emails = []
    labels = []
    languages = []
    for category, label, language, placeholders, count in plan:
        templates = email_templates[category][language]
        for _ in range(count):
            template = random.choice(templates)
            emails.append(fill_template(template, placeholders))
            labels.append(label)
            languages.append(language)

    df = pd.DataFrame({
        "email_text": emails,
        "label": labels,
        "language": languages,
    })

    # Shuffle so categories and languages are interleaved rather than grouped.
    return df.sample(frac=1).reset_index(drop=True)
# Generate the sample e-mails, write them to CSV, and print a summary
sample_df = generate_sample_emails(num_legitimate=50, num_spam=50)
sample_path = os.path.join(OUTPUT_DIR, "sample_emails.csv")
sample_df.to_csv(sample_path, index=False)

# Per-label and per-language row counts for the summary below
legitimate_count = (sample_df["label"] == 0).sum()
spam_count = (sample_df["label"] == 1).sum()
ja_count = (sample_df["language"] == "ja").sum()
en_count = (sample_df["language"] == "en").sum()

print(f"サンプルメールを {sample_path} に保存しました。")
print(f"データ件数: {len(sample_df)}件")
print(f" - 正当な問い合わせ: {legitimate_count}件")
print(f" - 営業・スパム: {spam_count}件")
print(f" - 日本語: {ja_count}件")
print(f" - 英語: {en_count}件")