File size: 4,746 Bytes
d488241 f788a29 d488241 092e58d d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 d488241 f788a29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
from pathlib import Path
import logging
import json
from transformers import pipeline
from textblob import TextBlob
import spacy
import re
logger = logging.getLogger(__name__)
class ProcessingError(Exception):
    """Raised when an ad cannot be processed by the AI pipeline."""
class AIPipeline:
    """NLP pipeline that scores sentiment and extracts topics and named
    entities from ad content using spaCy, a transformers sentiment model,
    and TextBlob."""

    # Filler words excluded from topic candidates even when they pass the
    # minimum-length check (e.g. 'this', 'those').
    _STOPWORDS = frozenset({'the', 'this', 'that', 'these', 'those'})

    def __init__(self):
        """Initialize the AI pipeline with necessary models.

        Raises:
            Exception: re-raises any model-loading failure after logging it
                (e.g. OSError when the spaCy model is not installed).
        """
        try:
            # Load spaCy model for NER and topic extraction
            self.nlp = spacy.load('en_core_web_sm')
            # Initialize sentiment analyzer
            self.sentiment = pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english')
            logger.info("AI Pipeline initialized successfully")
        except Exception:
            logger.exception("Error initializing AI Pipeline")
            raise

    def _analyze_sentiment(self, text: str) -> float:
        """Return a sentiment score in [-1, 1] for *text*.

        Blends the transformer classifier's signed confidence with
        TextBlob's polarity; returns 0.0 (neutral) for empty input or on
        any analysis failure (best-effort by design).
        """
        # Empty/whitespace input is neutral by definition; skip the models.
        if not text or not text.strip():
            return 0.0
        try:
            # truncation=True guards against inputs longer than the model's
            # 512-token limit, which would otherwise raise at inference time
            # and silently collapse every long ad to a 0.0 score.
            result = self.sentiment(text, truncation=True)[0]
            # Map POSITIVE/NEGATIVE label + confidence to a signed score.
            if result['label'] == 'POSITIVE':
                score = result['score']
            else:
                score = -result['score']
            # Average with TextBlob polarity for additional nuance.
            blob_score = TextBlob(text).sentiment.polarity
            return (score + blob_score) / 2
        except Exception:
            logger.exception("Error in sentiment analysis")
            return 0.0

    def _extract_topics(self, text: str) -> list:
        """Return up to five topic strings, in order of first mention.

        Candidates are noun chunks plus ORG/PRODUCT/EVENT/WORK_OF_ART
        entities, lower-cased and stripped of punctuation. Returns [] on
        failure.
        """
        try:
            doc = self.nlp(text)
            # Noun phrases plus topic-like named entities as candidates.
            noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
            entities = [ent.text.lower() for ent in doc.ents
                        if ent.label_ in ('ORG', 'PRODUCT', 'EVENT', 'WORK_OF_ART')]
            cleaned_topics = []
            for topic in noun_phrases + entities:
                # Drop punctuation and collapse internal whitespace.
                topic = ' '.join(re.sub(r'[^\w\s]', '', topic).split())
                # Filter out short fragments and filler words.
                if len(topic) > 3 and topic not in self._STOPWORDS:
                    cleaned_topics.append(topic)
            # Dedupe while preserving first-mention order, then keep the
            # first five. (The previous list(set(...)) + sorted() truncated
            # alphabetically, discarding topics arbitrarily.)
            return list(dict.fromkeys(cleaned_topics))[:5]
        except Exception:
            logger.exception("Error in topic extraction")
            return []

    def _extract_entities(self, text: str) -> list:
        """Return all named entities in *text* as dicts with keys
        'text', 'type' (spaCy label), and 'description' (label gloss).

        Returns [] on failure.
        """
        try:
            doc = self.nlp(text)
            return [{'text': ent.text,
                     'type': ent.label_,
                     'description': spacy.explain(ent.label_)}
                    for ent in doc.ents]
        except Exception:
            logger.exception("Error in entity extraction")
            return []

    def process_ad(self, ad) -> dict:
        """Analyze *ad* and return its sentiment, topics, and entities.

        Args:
            ad: object expected to carry a ``content`` string attribute.

        Returns:
            dict with keys 'sentiment' (float), 'topics' (list[str]),
            'entities' (list[dict]); a neutral/empty result when *ad* has
            no content or processing fails.
        """
        empty_result = {'sentiment': 0.0, 'topics': [], 'entities': []}
        try:
            content = getattr(ad, 'content', None)
            # Nothing to analyze: return the neutral result.
            if not content:
                return empty_result
            return {
                'sentiment': self._analyze_sentiment(content),
                'topics': self._extract_topics(content),
                'entities': self._extract_entities(content),
            }
        except Exception:
            logger.exception("Error in ad processing")
            return empty_result