File size: 5,129 Bytes
0e759d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import { logger } from "../../../lib/logger";
import { areMergeable } from "./merge-null-val-objs";
import { transformArrayToObject } from "./transform-array-to-obj";

/**
 * Data obtained from one source, paired with the URL it came from.
 */
interface TransformedResult {
  /** Either a flat list of items, or lists of items grouped under string keys. */
  transformed: Record<string, any[]> | any[];
  /** URL of the source that produced `transformed`. */
  url: string;
}

/**
 * Tracks which source URLs contributed to each extracted item through the
 * transformation, deduplication, and merging process.
 *
 * `transformResults` records the per-source data; the other two methods
 * compare items against whatever was recorded by the most recent call.
 */
export class SourceTracker {
  /** Per-source data recorded by the latest `transformResults` call. */
  private transformedResults: TransformedResult[] = [];

  /**
   * `JSON.stringify(item)` -> URLs of the sources containing an identical item.
   * Written by `trackPreDeduplicationSources`; not read back within this class.
   */
  private preDedupeSourceMap: Map<string, string[]> = new Map();

  /**
   * Transform raw extraction results into a format that preserves source
   * information, and optionally return the combined result.
   *
   * The shape of the FIRST extract selects the mode for the whole batch:
   * - array: each extract is stored as-is; the combined result is all extracts
   *   flattened by one level.
   * - anything else: each extract is passed on its own through
   *   `transformArrayToObject(schema, [extract])` and stored; the combined
   *   result is `transformArrayToObject(schema, allExtracts)`.
   *
   * @param extractionResults One entry per source: the extracted data and its URL.
   * @param schema Forwarded untouched to `transformArrayToObject` (object mode only).
   * @param withTransform When true (default) return the combined result; when
   *   false return the stored per-source records instead.
   */
  transformResults(
    extractionResults: { extract: any; url: string }[],
    schema: any,
    withTransform: boolean = true
  ) {
    // Handle array outputs
    if (Array.isArray(extractionResults[0]?.extract)) {
      this.transformedResults = extractionResults.map(result => ({
        transformed: result.extract,
        url: result.url
      }));

      if (withTransform) {
        // Combine all extracts to match original behavior
        return extractionResults.map(r => r.extract).flat();
      }
      return this.transformedResults;
    }

    // Handle object outputs (original behavior)
    this.transformedResults = extractionResults.map(result => ({
      transformed: transformArrayToObject(schema, [result.extract]),
      url: result.url
    }));

    if (withTransform) {
      // Then combine all extracts and transform them together to match original behavior
      const combinedExtracts = extractionResults.map(r => r.extract);
      return transformArrayToObject(schema, combinedExtracts);
    }
    return this.transformedResults;
  }

  /**
   * Track sources for each item before deduplication.
   *
   * For every item, stores in `preDedupeSourceMap` the URLs of the recorded
   * sources that contain an item with the same `JSON.stringify` output.
   * Failures are logged and swallowed; this method never throws.
   *
   * @param multiEntityResult A flat list of items, or lists of items grouped
   *   under string keys. Values that are not arrays are skipped.
   */
  trackPreDeduplicationSources(multiEntityResult: { [key: string]: any[] } | any[]): void {
    try {
      if (Array.isArray(multiEntityResult)) {
        this.recordIdenticalSources(multiEntityResult);
        return;
      }

      for (const [key, items] of Object.entries(multiEntityResult)) {
        // A non-array value has no items to track; skipping it keeps one odd
        // key from aborting the tracking of every other key.
        if (Array.isArray(items)) {
          this.recordIdenticalSources(items, key);
        }
      }
    } catch (error) {
      logger.error(`Failed to track pre-deduplication sources`, { error });
    }
  }

  /**
   * Map sources to final deduplicated/merged items.
   *
   * A source URL is credited to a final item when at least one of that
   * source's recorded items is mergeable with it according to `areMergeable`.
   *
   * @param multiEntityResult Final items: a flat list, or lists grouped under keys.
   * @param multiEntityKeys Keys of `multiEntityResult` to inspect; only used
   *   when `multiEntityResult` is not an array.
   * @returns URLs per final item, keyed `[index]` for a flat list or
   *   `key[index]` for grouped lists. Returns `{}` if anything throws.
   */
  mapSourcesToFinalItems(
    multiEntityResult: { [key: string]: any[] } | any[],
    multiEntityKeys: string[]
  ): Record<string, string[]> {
    try {
      const sources: Record<string, string[]> = {};

      if (Array.isArray(multiEntityResult)) {
        // Handle array outputs
        multiEntityResult.forEach((item: any, finalIndex: number) => {
          sources[`[${finalIndex}]`] = this.findMergeableSources(item);
        });
      } else {
        // Handle object outputs (original behavior)
        // Captured in a const so the narrowed (non-array) type is kept inside the callbacks.
        const itemsByKey = multiEntityResult;
        multiEntityKeys.forEach(key => {
          const finalItems = itemsByKey[key];
          if (!Array.isArray(finalItems)) {
            return;
          }
          finalItems.forEach((item: any, finalIndex: number) => {
            sources[`${key}[${finalIndex}]`] = this.findMergeableSources(item, key);
          });
        });
      }

      return sources;
    } catch (error) {
      logger.error(`Failed to map sources to final items`, { error });
      return {};
    }
  }

  /**
   * Items that one recorded source holds for the requested mode: the whole
   * list when `key` is omitted, otherwise the list stored under `key`.
   * Returns an empty list whenever the expected array is not there, so callers
   * never invoke array methods on a non-array value.
   */
  private itemsOf(transformed: { [key: string]: any[] } | any[], key?: string): any[] {
    if (key === undefined) {
      return Array.isArray(transformed) ? transformed : [];
    }
    // An array-shaped source has no keyed groups (string keys such as "length"
    // would otherwise resolve to non-item values).
    if (transformed == null || Array.isArray(transformed)) {
      return [];
    }
    const group = transformed[key];
    return Array.isArray(group) ? group : [];
  }

  /**
   * Stores, for each given item, the URLs of the sources holding an identical
   * (same `JSON.stringify` output) item. Each source item is serialized once
   * per call rather than once per compared item.
   */
  private recordIdenticalSources(items: any[], key?: string): void {
    if (items.length === 0) {
      return; // nothing to look up, so do not serialize the sources at all
    }

    // serialized item -> URLs of sources containing it, in recorded order
    const urlsBySerializedItem = new Map<string, string[]>();
    for (const result of this.transformedResults) {
      const serializedItems = new Set<string>();
      this.itemsOf(result.transformed, key).forEach((sourceItem: any) => {
        serializedItems.add(JSON.stringify(sourceItem));
      });
      // The Set ensures a source is listed once per item even if it repeats it.
      for (const serialized of serializedItems) {
        const urls = urlsBySerializedItem.get(serialized);
        if (urls) {
          urls.push(result.url);
        } else {
          urlsBySerializedItem.set(serialized, [result.url]);
        }
      }
    }

    items.forEach((item: any) => {
      const itemKey = JSON.stringify(item);
      this.preDedupeSourceMap.set(itemKey, urlsBySerializedItem.get(itemKey) ?? []);
    });
  }

  /**
   * Distinct URLs, in recorded order, of the sources holding at least one item
   * that `areMergeable` accepts together with `item`.
   */
  private findMergeableSources(item: any, key?: string): string[] {
    const urls = new Set<string>();
    for (const result of this.transformedResults) {
      // One mergeable item is enough to credit the source, so stop at the first.
      const contributes = this.itemsOf(result.transformed, key).some(
        (originalItem: any) => areMergeable(item, originalItem)
      );
      if (contributes) {
        urls.add(result.url);
      }
    }
    return Array.from(urls);
  }
}