File size: 1,904 Bytes
93a5bf9
5c271a3
ef9fa4b
 
7a3f093
ef9fa4b
a6a0614
ef9fa4b
7a3f093
5c271a3
ef9fa4b
5c271a3
a6a0614
 
5c271a3
7a3f093
 
ef9fa4b
7a3f093
 
ef9fa4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93a5bf9
 
 
 
 
 
ef9fa4b
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import pandas as pd
from src.genai.utils.data_loader import caption_df
from src.genai.utils.models_loader import llm_gpt
from .prompts import details_extract_prompt
from langchain_core.messages import SystemMessage, HumanMessage
from .state import DetailsFormatter
from langsmith import traceable

class DetailsExtractorNode:
    """Extract structured details from interactions using the configured LLM.

    The node sends the details-extraction prompt as the system message and
    the stringified interactions as the human message, asking the model for
    output structured as ``DetailsFormatter``.
    """

    def __init__(self, interactions):
        """Store the model handle and the interactions to analyse.

        Args:
            interactions: Object describing the interactions; it is passed to
                the model as ``str(interactions)``.
        """
        self.llm = llm_gpt
        self.interactions = interactions

    @traceable(name="details extraction")
    def run(self):
        """Invoke the model and return the extracted details.

        Returns:
            The value of ``model_dump()`` called on the structured response
            (presumably a plain dict of the ``DetailsFormatter`` fields --
            confirm against the ``DetailsFormatter`` definition).
        """
        system_prompt = details_extract_prompt()
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=str(self.interactions)),
        ]
        # Use the instance's model rather than the module global so that the
        # attribute set in __init__ is the single source of truth.
        structured_llm = self.llm.with_structured_output(DetailsFormatter)
        response = structured_llm.invoke(messages)
        return response.model_dump()


class SaveToDB:
    """Filter a caption dataframe by extracted business details and save it.

    Despite the name, the visible code only writes a CSV file.
    """

    def __init__(self, caption_df):
        """Keep a copy of *caption_df* without its ``embeddings`` column.

        Args:
            caption_df: DataFrame to search. A missing ``embeddings`` column
                is tolerated (``errors='ignore'``).
        """
        self.df = caption_df.drop(columns=['embeddings'], errors='ignore')

    def _prepare_values(self, business_details):
        """Extract lowercase string values from business_details dict.

        String values are used directly; list values contribute each item
        converted with ``str``. Values of any other type are ignored.
        Blank (empty or whitespace-only) entries are skipped, because an
        empty search term is a substring of every cell and would make every
        row match.

        Args:
            business_details: Mapping whose values are inspected.

        Returns:
            A set of lowercase search terms.
        """
        all_values = set()
        for v in business_details.values():
            if isinstance(v, str):
                candidates = [v]
            elif isinstance(v, list):
                candidates = [str(item) for item in v]
            else:
                continue
            all_values.update(
                text.lower() for text in candidates if text.strip()
            )
        return all_values

    def _row_matches(self, row, all_values):
        """Check if any value in all_values exists in the row.

        Matching is a case-insensitive substring test against the string
        form of each cell.

        Args:
            row: Iterable of cell values (a row Series when called via
                ``DataFrame.apply``).
            all_values: Lowercase search terms.

        Returns:
            True if at least one term occurs in at least one cell.
        """
        for cell in row:
            # Lowercase each cell once instead of once per search term.
            cell_text = str(cell).lower()
            if any(val in cell_text for val in all_values):
                return True
        return False

    def save_to_csv(self, business_details, output_file='extracted_data.csv'):
        """Filter dataframe rows based on business_details and save to CSV.

        Args:
            business_details: Mapping of extracted details; see
                ``_prepare_values`` for which values are used.
            output_file: Destination path for the CSV (written without the
                index).
        """
        all_values = self._prepare_values(business_details)
        if self.df.empty:
            # Row-wise apply on an empty frame is not guaranteed to yield a
            # boolean mask; write a zero-row frame with the headers intact.
            matched_df = self.df.iloc[0:0]
        else:
            mask = self.df.apply(self._row_matches, axis=1, args=(all_values,))
            matched_df = self.df[mask]
        matched_df.to_csv(output_file, index=False)