Spaces:

goodmodeler
/

AdGPT

Sleeping

App Files Files Community

goodmodeler committed on Aug 17

Commit

a7af970

1 Parent(s): c99bc7a

UPDATE: rag

Browse files

Files changed (1) hide show

retrieval_augmented_generation/build_embeddings.py +198 -213

retrieval_augmented_generation/build_embeddings.py CHANGED Viewed

@@ -1,261 +1,246 @@
 #!/usr/bin/env python3
 """
-使用BERT + FAISS构建产品描述和Slogan的嵌入数据库
-支持相似性搜索和检索
 """
-import faiss
 import numpy as np
-import pandas as pd
 from sentence_transformers import SentenceTransformer
 from datasets import Dataset
-import pickle
-import json
-from typing import List, Dict, Tuple
-import os
-class SloganEmbeddingDB:
-    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
-        """
-        初始化BERT+FAISS数据库
-        Args:
-            model_name: 多语言BERT模型，支持中英文
-        """
-        print(f"📥 Loading BERT model: {model_name}")
-        self.model = SentenceTransformer(model_name)
-        self.dimension = self.model.get_sentence_embedding_dimension()
-        # 初始化FAISS索引
-        self.index = faiss.IndexFlatIP(self.dimension)  # 内积相似度
-        self.data = []  # 存储原始数据
-        print(f"✅ Model loaded. Embedding dimension: {self.dimension}")
-    def create_sample_dataset(self) -> Dataset:
-        """创建示例数据集"""
-        sample_data = [
-            # 中文品牌
-            {"business": "肯德基", "category": "快餐", "description": "美式炸鸡快餐连锁", "slogan": "有了肯德基生活好滋味"},
-            {"business": "麦当劳", "category": "快餐", "description": "全球知名汉堡快餐", "slogan": "我就喜欢"},
-            {"business": "星巴克", "category": "咖啡", "description": "全球连锁咖啡店", "slogan": "启发并滋润人类精神"},
-            {"business": "小米", "category": "电子产品", "description": "智能手机和科技产品", "slogan": "让每个人都能享受科技的乐趣"},
-            {"business": "华为", "category": "电子产品", "description": "通信设备和智能手机", "slogan": "构建万物互联的智能世界"},
-            # 英文品牌
-            {"business": "Nike", "category": "运动用品", "description": "Athletic footwear and apparel", "slogan": "Just Do It"},
-            {"business": "Apple", "category": "科技", "description": "Consumer electronics and software", "slogan": "Think Different"},
-            {"business": "Coca-Cola", "category": "饮料", "description": "Carbonated soft drinks", "slogan": "Open Happiness"},
-            {"business": "BMW", "category": "汽车", "description": "Luxury automobiles", "slogan": "The Ultimate Driving Machine"},
-            {"business": "Amazon", "category": "电商", "description": "E-commerce and cloud services", "slogan": "Earth's Most Customer-Centric Company"},
-            # 产品描述
-            {"business": "智能手表", "category": "可穿戴设备", "description": "健康监测和通知功能的智能手表", "slogan": "时刻关注您的健康"},
-            {"business": "电动汽车", "category": "新能源汽车", "description": "零排放环保电动车", "slogan": "绿色出行，智享未来"},
-            {"business": "在线教育平台", "category": "教育科技", "description": "AI驱动的个性化学习平台", "slogan": "让学习更智能"},
-            {"business": "健身APP", "category": "健康应用", "description": "AI私教健身指导应用", "slogan": "随时随地，专业健身"},
-            {"business": "外卖平台", "category": "生活服务", "description": "快速便捷的餐食配送服务", "slogan": "美食到家，生活更美好"},
         ]
-        return Dataset.from_pandas(pd.DataFrame(sample_data))
-    def build_embeddings(self, dataset: Dataset):
-        """构建嵌入向量并建立FAISS索引"""
-        print("🔨 Building embeddings and FAISS index...")
-        # 准备数据
-        texts = []
-        for item in dataset:
-            # 组合文本：业务名称 + 类别 + 描述
-            combined_text = f"{item['business']} {item['category']} {item['description']}"
-            texts.append(combined_text)
-            # 保存原始数据
-            self.data.append({
-                "business": item["business"],
-                "category": item["category"],
-                "description": item["description"],
-                "slogan": item["slogan"],
-                "combined_text": combined_text
-            })
-        # 生成嵌入向量
-        print(f"📊 Generating embeddings for {len(texts)} items...")
-        embeddings = self.model.encode(texts, show_progress_bar=True)
-        # 标准化向量（用于余弦相似度）
-        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
-        # 添加到FAISS索引
         self.index.add(embeddings.astype('float32'))
-        print(f"✅ Built FAISS index with {self.index.ntotal} vectors")
-    def search_similar(self, query: str, top_k: int = 5) -> List[Dict]:
-        """搜索相似的业务描述"""
-        print(f"🔍 Searching for: '{query}'")
-        # 生成查询向量
-        query_embedding = self.model.encode([query])
-        query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)
-        # FAISS搜索
-        scores, indices = self.index.search(query_embedding.astype('float32'), top_k)
-        # 整理结果
         results = []
-        for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
-            if idx < len(self.data):
-                result = self.data[idx].copy()
-                result["similarity_score"] = float(score)
-                result["rank"] = i + 1
                 results.append(result)
         return results
-    def save_database(self, save_path: str = "./slogan_db"):
         """保存数据库"""
-        os.makedirs(save_path, exist_ok=True)
         # 保存FAISS索引
-        faiss.write_index(self.index, f"{save_path}/faiss.index")
-        # 保存数据
-        with open(f"{save_path}/data.pkl", "wb") as f:
-            pickle.dump(self.data, f)
-        # 保存配置
-        config = {
-            "model_name": self.model._modules['0'].auto_model.config.name_or_path,
-            "dimension": self.dimension,
-            "total_items": len(self.data)
-        }
-        with open(f"{save_path}/config.json", "w", encoding="utf-8") as f:
-            json.dump(config, f, ensure_ascii=False, indent=2)
-        print(f"💾 Database saved to {save_path}")
-    def load_database(self, load_path: str = "./slogan_db"):
         """加载数据库"""
-        print(f"📂 Loading database from {load_path}")
-        # 加载FAISS索引
-        self.index = faiss.read_index(f"{load_path}/faiss.index")
-        # 加载数据
-        with open(f"{load_path}/data.pkl", "rb") as f:
-            self.data = pickle.load(f)
-        print(f"✅ Loaded database with {len(self.data)} items")
-    def add_new_item(self, business: str, category: str, description: str, slogan: str):
-        """动态添加新项目"""
-        combined_text = f"{business} {category} {description}"
-        # 生成嵌入
-        embedding = self.model.encode([combined_text])
-        embedding = embedding / np.linalg.norm(embedding, axis=1, keepdims=True)
-        # 添加到索引
-        self.index.add(embedding.astype('float32'))
-        # 添加到数据
-        self.data.append({
-            "business": business,
-            "category": category,
-            "description": description,
-            "slogan": slogan,
-            "combined_text": combined_text
-        })
-        print(f"➕ Added new item: {business}")
-    def generate_slogan_suggestions(self, business_description: str, top_k: int = 3) -> List[str]:
-        """根据业务描述生成Slogan建议"""
-        similar_items = self.search_similar(business_description, top_k)
-        suggestions = []
-        for item in similar_items:
-            suggestions.append({
-                "slogan": item["slogan"],
-                "reference": f"{item['business']} ({item['category']})",
-                "similarity": item["similarity_score"]
-            })
-        return suggestions
-def main():
-    """主函数演示"""
-    # 初始化数据库
-    db = SloganEmbeddingDB()
-    # 创建或加载数据
-    if os.path.exists("./slogan_db"):
-        print("📂 Found existing database, loading...")
-        db.load_database()
-    else:
-        print("🆕 Creating new database...")
-        dataset = db.create_sample_dataset()
-        db.build_embeddings(dataset)
-        db.save_database()
     # 测试搜索
     test_queries = [
-        "智能穿戴设备健康监测",
-        "环保新能源汽车",
-        "人工智能学习平台",
-        "美式快餐炸鸡",
-        "luxury sports car",
-        "mobile phone technology"
     ]
-    print("\n" + "="*60)
-    print("🔍 SEARCH RESULTS")
-    print("="*60)
     for query in test_queries:
-        print(f"\n🔍 Query: {query}")
-        results = db.search_similar(query, top_k=3)
-        for result in results:
-            print(f"  {result['rank']}. {result['business']} ({result['category']})")
-            print(f"     描述: {result['description']}")
-            print(f"     Slogan: {result['slogan']}")
-            print(f"     相似度: {result['similarity_score']:.3f}")
             print()
-    # 测试Slogan生成建议
-    print("\n" + "="*60)
-    print("💡 SLOGAN SUGGESTIONS")
-    print("="*60)
-    new_business = "AI智能音箱语音助手设备"
-    print(f"\n💡 为 '{new_business}' 生成Slogan建议:")
-    suggestions = db.generate_slogan_suggestions(new_business)
-    for i, suggestion in enumerate(suggestions, 1):
-        print(f"  {i}. \"{suggestion['slogan']}\"")
-        print(f"     参考: {suggestion['reference']}")
-        print(f"     相似度: {suggestion['similarity']:.3f}")
-        print()
-    # 演示动态添加
-    print("\n" + "="*60)
-    print("➕ ADDING NEW ITEM")
-    print("="*60)
-    db.add_new_item(
-        business="智能眼镜",
-        category="AR设备",
-        description="增强现实智能眼镜产品",
-        slogan="看见未来，触手可及"
-    )
-    # 重新搜索测试
-    print(f"\n🔍 搜索 'AR增强现实产品':")
-    results = db.search_similar("AR增强现实产品", top_k=2)
-    for result in results:
-        print(f"  - {result['business']}: {result['slogan']} (相似度: {result['similarity_score']:.3f})")
 if __name__ == "__main__":
     main()

 #!/usr/bin/env python3
 """
+简洁版BERT+FAISS标语数据库
+输入：产品/业务描述
+输出：匹配的广告标语
 """
 import numpy as np
+import faiss
+import json
 from sentence_transformers import SentenceTransformer
 from datasets import Dataset
+import pandas as pd
class SloganDatabase:
    """Slogan lookup backed by sentence embeddings and a FAISS inner-product index.

    Each record is a dict with the keys ``brand``, ``category``,
    ``description``, ``slogan`` and ``search_text``; ``search_text`` is the
    text that gets embedded and indexed.

    Attributes:
        encoder: SentenceTransformer used for both records and queries.
        index: FAISS index, or ``None`` until built or loaded.
        slogans: Records, positionally aligned with the vectors in ``index``.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence encoder and start with an empty database.

        Args:
            model_name: SentenceTransformer model to load.
        """
        # NOTE(review): the bundled dataset and the demo queries contain
        # Chinese text; confirm the default model handles it well enough, or
        # pass a multilingual model name here.
        self.encoder = SentenceTransformer(model_name)
        self.index = None
        self.slogans = []

    def create_dataset(self):
        """Return the built-in jewelry / luxury-goods slogan records.

        Returns:
            A list of dicts with keys ``brand``, ``category``,
            ``description``, ``slogan`` and ``search_text`` (the first three
            fields joined with single spaces).
        """
        # Rows are [brand, category, description, slogan].
        data = [
            # Top-tier jewelry houses
            ["Tiffany & Co.", "jewelry", "luxury diamond jewelry and engagement rings", "A Diamond is Forever"],
            ["Cartier", "luxury_jewelry", "high-end jewelry watches and accessories", "L'art de vivre"],
            ["Van Cleef & Arpels", "jewelry", "French luxury jewelry and watches", "Poetry of Time"],
            ["Harry Winston", "jewelry", "rare diamonds and luxury jewelry", "Rare Jewels of the World"],
            ["Bulgari", "jewelry", "Italian luxury jewelry and watches", "Italian Excellence"],
            ["Chopard", "jewelry", "Swiss luxury jewelry and watches", "Happy Diamonds"],
            ["Graff", "jewelry", "exceptional diamonds and jewelry", "The Most Fabulous Jewels in the World"],
            ["Piaget", "jewelry", "Swiss luxury watches and jewelry", "Possession"],
            ["Boucheron", "jewelry", "French high jewelry and luxury watches", "Le Joaillier Depuis 1858"],
            ["Mikimoto", "jewelry", "cultured pearl jewelry", "The Originator of Cultured Pearls"],
            # Luxury fashion houses
            ["Louis Vuitton", "luxury_fashion", "luxury leather goods and fashion", "The Art of Travel"],
            ["Hermès", "luxury_fashion", "French luxury goods and accessories", "Luxury in the making"],
            ["Chanel", "luxury_fashion", "haute couture and luxury fashion", "Inside every woman there is a flower and a cat"],
            ["Gucci", "luxury_fashion", "Italian luxury fashion and accessories", "Quality is remembered long after price is forgotten"],
            ["Prada", "luxury_fashion", "Italian luxury fashion house", "Prada"],
            ["Dior", "luxury_fashion", "French luxury fashion and beauty", "Miss Dior"],
            ["Versace", "luxury_fashion", "Italian luxury fashion design", "Virtus"],
            ["Saint Laurent", "luxury_fashion", "French luxury fashion house", "Saint Laurent Paris"],
            ["Balenciaga", "luxury_fashion", "Spanish luxury fashion house", "Balenciaga"],
            ["Bottega Veneta", "luxury_fashion", "Italian luxury leather goods", "When your own initials are enough"],
            # Watch brands
            ["Rolex", "luxury_watches", "Swiss luxury watches and timepieces", "Perpetual, Spirit of Excellence"],
            ["Patek Philippe", "luxury_watches", "Swiss luxury watch manufacturer", "You never actually own a Patek Philippe"],
            ["Audemars Piguet", "luxury_watches", "Swiss luxury watch brand", "To break the rules, you must first master them"],
            ["Omega", "luxury_watches", "Swiss luxury watch manufacturer", "Precision"],
            ["TAG Heuer", "luxury_watches", "Swiss luxury watches", "Don't crack under pressure"],
            ["Breitling", "luxury_watches", "Swiss luxury watchmaker", "Instruments for Professionals"],
            ["IWC", "luxury_watches", "Swiss luxury watch company", "Engineered for men"],
            ["Jaeger-LeCoultre", "luxury_watches", "Swiss luxury watch manufacturer", "The World's Most Complicated Watches"],
            ["Vacheron Constantin", "luxury_watches", "Swiss luxury watch manufacturer", "One of Not Many"],
            ["A. Lange & Söhne", "luxury_watches", "German luxury watch manufacturer", "When nothing else will do"],
            # Fashion jewelry
            ["Pandora", "fashion_jewelry", "Danish jewelry brand charm bracelets", "Be Love"],
            ["Swarovski", "fashion_jewelry", "Austrian crystal jewelry and accessories", "Unleash Your Light"],
            ["Daniel Wellington", "fashion_watches", "Swedish watch brand minimalist design", "Live the moment"],
            ["Alex and Ani", "fashion_jewelry", "American jewelry brand spiritual bracelets", "Positive Energy"],
            ["Kendra Scott", "fashion_jewelry", "American jewelry designer colorful stones", "Live colorfully"],
            ["Monica Vinader", "fashion_jewelry", "British jewelry brand contemporary design", "Everyday luxury"],
            ["Mejuri", "fashion_jewelry", "Canadian jewelry brand everyday luxury", "Everyday fine"],
            ["Gorjana", "fashion_jewelry", "California jewelry brand layered necklaces", "Live your layer"],
            ["Kate Spade", "fashion_jewelry", "American fashion accessories jewelry", "Live colorfully"],
            ["Marc Jacobs", "fashion_jewelry", "American fashion designer accessories", "Marc Jacobs"],
            # Bespoke / diamond jewelry
            ["Blue Nile", "diamond_jewelry", "online diamond jewelry retailer", "Extraordinary diamonds for extraordinary moments"],
            ["James Allen", "diamond_jewelry", "online engagement ring retailer", "See it. Love it. Own it."],
            ["Brilliant Earth", "diamond_jewelry", "ethical diamond jewelry", "Brilliant Earth"],
            ["With Clarity", "diamond_jewelry", "lab-grown diamond jewelry", "Diamonds. Redefined."],
            ["Clean Origin", "diamond_jewelry", "lab-created diamond jewelry", "Grown with love"],
            ["Ritani", "diamond_jewelry", "engagement rings and wedding bands", "Love is in the details"],
            ["Vrai", "diamond_jewelry", "lab-grown diamond jewelry", "Created, not mined"],
            ["Catbird", "jewelry", "Brooklyn-based jewelry designer", "Made in Brooklyn"],
            ["Wwake", "jewelry", "contemporary fine jewelry designer", "Wwake"],
            ["Jacquie Aiche", "jewelry", "California jewelry designer bohemian luxury", "Jacquie Aiche"],
            # Chinese-language jewelry brands
            ["周大福", "jewelry", "香港珠宝品牌黄金钻石", "心意足金"],
            ["周生生", "jewelry", "香港珠宝品牌传统工艺", "传承经典"],
            ["老凤祥", "jewelry", "中国传统珠宝品牌黄金首饰", "老凤祥，真金不怕火炼"],
            ["六福珠宝", "jewelry", "香港珠宝品牌时尚设计", "六福临门"],
            ["潘多拉", "jewelry", "丹麦珠宝品牌串珠手链", "表达你的故事"],
            ["周大生", "jewelry", "中国珠宝品牌钻石首饰", "爱就在一起"],
            ["金伯利", "jewelry", "中国钻石珠宝品牌", "只为更好的你"],
            ["戴比尔斯", "diamond_jewelry", "钻石开采珠宝品牌", "钻石恒久远，一颗永流传"],
            ["施华洛世奇", "crystal_jewelry", "奥地利水晶珠宝品牌", "释放你的光芒"],
            ["谢瑞麟", "jewelry", "香港珠宝设计师品牌", "艺术珠宝"],
            # Luxury accessories
            ["Goyard", "luxury_accessories", "French luxury leather goods", "Goyard"],
            ["Moynat", "luxury_accessories", "French luxury leather goods", "Moynat"],
            ["Berluti", "luxury_accessories", "French luxury leather goods", "Berluti"],
            ["Valextra", "luxury_accessories", "Italian luxury leather goods", "Milanese excellence since 1937"],
            ["Loewe", "luxury_accessories", "Spanish luxury leather goods", "Craft"],
            ["Brunello Cucinelli", "luxury_fashion", "Italian luxury fashion cashmere", "Humanistic Enterprise"],
            ["Loro Piana", "luxury_fashion", "Italian luxury textile and clothing", "Excellence in natural fibers"],
            ["Kiton", "luxury_fashion", "Italian luxury menswear", "The most beautiful thing made by man"],
            ["Zegna", "luxury_fashion", "Italian luxury menswear", "What makes a man"],
            ["Brioni", "luxury_fashion", "Italian luxury menswear", "Roman style"],
            # Emerging luxury brands
            ["Jacquemus", "luxury_fashion", "French luxury fashion house", "La Montagne"],
            ["Ganni", "luxury_fashion", "Danish fashion brand", "Ganni"],
            ["Staud", "luxury_fashion", "American fashion brand", "Staud"],
            ["Cult Gaia", "luxury_accessories", "American accessories brand", "Cult Gaia"],
            ["Rosantica", "jewelry", "Italian jewelry brand", "Rosantica"],
            ["Alighieri", "jewelry", "British jewelry brand", "The Inferno"],
            ["Lizzie Fortunato", "jewelry", "American jewelry brand", "Lizzie Fortunato"],
            ["Aurate", "jewelry", "American jewelry brand", "Accessible luxury"],
            ["AUrate New York", "jewelry", "New York jewelry brand", "Radically responsible luxury"],
            ["Missoma", "jewelry", "British jewelry brand", "Missoma"],
        ]
        columns = ('brand', 'category', 'description', 'slogan')
        records = []
        for row in data:
            record = dict(zip(columns, row))
            # The indexed text is brand + category + description; the slogan
            # itself is deliberately not part of what gets embedded.
            record['search_text'] = ' '.join(row[:3])
            records.append(record)
        return records

    def build_index(self, data):
        """Embed every record's ``search_text`` and build the FAISS index.

        Args:
            data: Non-empty sequence of record dicts, each with a
                ``search_text`` key.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if not data:
            raise ValueError("Cannot build an index from an empty dataset")
        print("🔨 Building FAISS index...")
        texts = [item['search_text'] for item in data]
        embeddings = np.ascontiguousarray(
            self.encoder.encode(texts, show_progress_bar=True), dtype='float32'
        )
        # Size the index from the encoder output rather than a fixed 384 so a
        # different model_name keeps working.
        self.index = faiss.IndexFlatIP(embeddings.shape[1])  # inner-product similarity
        self.index.add(embeddings)
        self.slogans = data
        print(f"✅ Index built with {len(data)} slogans")

    def search(self, query, k=5):
        """Return up to ``k`` records most similar to ``query``.

        Args:
            query: Free-text product or business description.
            k: Maximum number of hits requested from the index.

        Returns:
            Copies of the matching records, best first, each with an added
            ``similarity_score`` float.

        Raises:
            ValueError: If no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        query_embedding = np.ascontiguousarray(
            self.encoder.encode([query]), dtype='float32'
        )
        scores, indices = self.index.search(query_embedding, k)
        return self._collect_results(scores[0], indices[0])

    def _collect_results(self, scores, indices):
        """Pair index hits with their records, dropping invalid positions."""
        results = []
        for score, idx in zip(scores, indices):
            idx = int(idx)
            # FAISS pads missing neighbours with -1 when k exceeds the number
            # of indexed vectors; without the lower bound, -1 would silently
            # resolve to the last record.
            if not 0 <= idx < len(self.slogans):
                continue
            result = self.slogans[idx].copy()
            result['similarity_score'] = float(score)
            results.append(result)
        return results

    def save(self, path="slogan_db"):
        """Write the index to ``{path}.faiss`` and the records to ``{path}.json``.

        Raises:
            ValueError: If no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        faiss.write_index(self.index, f"{path}.faiss")
        with open(f"{path}.json", 'w', encoding='utf-8') as f:
            json.dump(self.slogans, f, ensure_ascii=False, indent=2)
        print(f"💾 Database saved to {path}")

    def load(self, path="slogan_db"):
        """Load the index and records previously written by ``save``.

        Returns:
            ``True`` on success; ``False`` if either file is missing or
            unreadable, in which case the current index and records are left
            untouched.
        """
        try:
            index = faiss.read_index(f"{path}.faiss")
            with open(f"{path}.json", 'r', encoding='utf-8') as f:
                slogans = json.load(f)
        except (OSError, RuntimeError, ValueError) as exc:
            # RuntimeError is what FAISS raises for an unreadable index file;
            # ValueError covers malformed JSON. Anything else is unexpected
            # and is allowed to propagate.
            print(f"❌ Failed to load database from {path}")
            print(f"   Reason: {exc}")
            return False
        # Commit both pieces together so a half-failed load cannot pair a new
        # index with stale records.
        self.index = index
        self.slogans = slogans
        print(f"📂 Database loaded from {path}")
        return True
def main():
    """Build (or reload) the slogan database, then print hits for sample queries."""
    print("🚀 Creating Slogan Database...")
    db = SloganDatabase()

    # Reuse a previously saved database when it loads; otherwise create the
    # dataset, index it and persist the result.
    loaded = db.load()
    if not loaded:
        print("📊 Creating new database...")
        records = db.create_dataset()
        db.build_index(records)
        db.save()

    # Sample queries in Chinese and English.
    sample_queries = (
        "钻石订婚戒指",
        "奢侈品手袋",
        "瑞士手表品牌",
        "珍珠首饰",
        "黄金项链",
        "时尚耳环",
        "luxury jewelry brand",
        "designer handbag",
        "crystal accessories",
        "wedding rings",
    )

    print("\n🔍 Testing searches...")
    for text in sample_queries:
        print(f"\n查询: {text}")
        print("-" * 40)
        hits = db.search(text, k=3)
        for rank, hit in enumerate(hits, start=1):
            print(f"{rank}. {hit['brand']} ({hit['category']})")
            print(f"   描述: {hit['description']}")
            print(f"   标语: {hit['slogan']}")
            print(f"   相似度: {hit['similarity_score']:.3f}")
            print()
 if __name__ == "__main__":
     main()