device_price_detector / description_extractor.py
palakmathur's picture
first
8dc12f7
import logging
import re

import torch
from transformers import pipeline
#2
class DescriptionExtractor:
    """Extract a summary and damage-related keywords from a free-text description.

    The extractor holds a Hugging Face ``summarization`` pipeline (model
    ``facebook/bart-large-cnn``) plus two keyword lists: device parts and
    damage symptoms. Keyword detection is plain, case-insensitive substring
    containment, so a keyword such as ``"port"`` also matches inside a longer
    word such as ``"support"``.
    """

    def __init__(self):
        """Create the summarization pipeline and the keyword vocabularies."""
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn"
        )
        self.part_keywords = [
            "screen", "display", "glass", "battery", "power",
            "charging port", "port", "hinge", "keyboard", "keys",
            "speaker", "audio", "microphone", "body", "frame",
            "casing", "lid", "touchpad", "camera"
        ]
        self.symptom_keywords = [
            "crack", "broken", "damage", "not working", "loose",
            "drain", "hot", "overheat", "scratch", "dent",
            "bent", "water", "liquid", "sound", "audio"
        ]

    def extract(self, description):
        """Analyse *description* and return a dict of extracted information.

        Args:
            description: The free-text description. ``None``, an empty string,
                or text shorter than 5 characters after stripping is treated
                as "no description".

        Returns:
            A dict with the keys ``'original'``, ``'summary'``,
            ``'affected_parts'``, ``'symptoms'``, ``'keywords'``,
            ``'length_category'`` (``'none'``, ``'short'``, ``'medium'`` or
            ``'long'``) and ``'word_count'``.
        """
        if not description or len(description.strip()) < 5:
            return {
                'original': description,
                'summary': description,
                'affected_parts': [],
                'symptoms': [],
                'keywords': [],
                'length_category': 'none',
                # Kept so both return paths expose the same set of keys.
                'word_count': len(description.split()) if description else 0
            }

        desc_lower = description.lower()
        word_count = len(description.split())

        if word_count < 10:
            length_category = 'short'
            summary = description
        elif word_count < 50:
            length_category = 'medium'
            summary = description
        else:
            # Only long descriptions are worth running through the model.
            length_category = 'long'
            summary = self._summarize(description)

        affected_parts = self._find_keywords(desc_lower, self.part_keywords)
        symptoms = self._find_keywords(desc_lower, self.symptom_keywords)

        # dict.fromkeys removes duplicates (e.g. "audio" is in both lists)
        # while keeping first-seen order, so the result is stable across runs.
        keywords = list(dict.fromkeys(affected_parts + symptoms))

        return {
            'original': description,
            'summary': summary,
            'affected_parts': affected_parts,
            'symptoms': symptoms,
            'keywords': keywords,
            'length_category': length_category,
            'word_count': word_count
        }

    def create_search_text(self, description_info):
        """Build a search string from the dict returned by :meth:`extract`.

        Returns the summary alone when no keywords were found; otherwise the
        summary followed by the space-joined keywords.
        """
        if not description_info['keywords']:
            return description_info['summary']
        search_text = f"{description_info['summary']} {' '.join(description_info['keywords'])}"
        return search_text

    def _summarize(self, description):
        """Summarize *description* with the pipeline, truncating on failure."""
        try:
            summary_result = self.summarizer(
                description,
                max_length=50,
                min_length=10,
                do_sample=False
            )
            return summary_result[0]['summary_text']
        except Exception:
            # Best-effort: a model failure must not break extraction, but it
            # should be visible in the logs rather than silently swallowed.
            logging.getLogger(__name__).warning(
                "Summarization failed; falling back to truncated description",
                exc_info=True
            )
            return ' '.join(description.split()[:40]) + "..."

    @staticmethod
    def _find_keywords(text_lower, vocabulary):
        """Return the entries of *vocabulary* that occur as substrings of *text_lower*."""
        return [term for term in vocabulary if term in text_lower]