Kushalguptaiitb committed on
Commit
f93825f
·
verified ·
1 Parent(s): 09dd0d0

Upload post_process_portfolio_company_json.py

Browse files
post_process_portfolio_company_json.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from fuzzywuzzy import fuzz
4
+ from typing import List, Dict, Any
5
+ import yaml
6
+ import warnings
7
+ import pandas as pd
8
+
9
+ # Constants
10
+ # PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["portfolio company or platforms", "portfolio company"]
11
# Column-header labels that mark a table column as holding portfolio company
# names. extract_portfolio_companies_from_table() fuzzy-matches each table
# header against these; fuzzy_match() lower-cases both sides first, so the
# mixed-case entries only differ cosmetically.
PORTFOLIO_COMPANY_LIST_IDENTIFIER = [
    "portfolio company or platforms",
    "\u20acm",
    "$m",
    "Unrealised fair market valuation",
    "Realised proceeds in the period",
    "Portfolio Company or Platforms",
    "portfolio company",
    "active investment",
    "realized/unrealized company",
    "Realized Company",
    "Unrealized Company",
    "quoted/unquoted company",
    "portfolio investment",
    "portfolio company",
]

# Default minimum fuzz.partial_ratio score for two strings to count as a match.
FUZZY_MATCH_THRESHOLD = 70

# Row labels that are fuzzy-matched and skipped when collecting company names
# from a table (summary rows rather than companies).
EXCLUDE_COMPANY_NAMES = [
    "total",
    "subtotal",
    "Total",
    "Investments",
    "Fund",
]
14
+
15
+
16
def get_file_name_without_extension(file_path: str) -> str:
    """Return the last component of ``file_path`` with its final extension removed.

    Only the last extension is stripped, so ``"a/b/x.tar.gz"`` yields ``"x.tar"``.
    """
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
19
+
20
def fuzzy_match(text: str, patterns: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> bool:
    """Return True when ``text`` fuzzy-matches at least one of ``patterns``.

    Args:
        text: Value to test; it is converted with ``str()`` and lower-cased.
        patterns: Candidate strings; each is lower-cased before comparison.
        threshold: Minimum ``fuzz.partial_ratio`` score that counts as a match.

    Returns:
        True as soon as one pattern scores ``>= threshold``, otherwise False.
    """
    lowered_text = str(text).lower()
    return any(
        fuzz.partial_ratio(lowered_text, candidate.lower()) >= threshold
        for candidate in patterns
    )
27
+
28
def extract_portfolio_companies_from_table(table_data: Dict) -> List[str]:
    """Extract company names from a portfolio company table.

    The company column is the first entry of ``table_column_header`` that
    fuzzy-matches ``PORTFOLIO_COMPANY_LIST_IDENTIFIER``. Each dict row of
    ``table_info`` is then read under that header name.

    Args:
        table_data: Table dict; reads the ``table_info`` (rows) and
            ``table_column_header`` (column names) keys.

    Returns:
        Stripped, non-empty cell values of the company column, in row order,
        excluding values that fuzzy-match ``EXCLUDE_COMPANY_NAMES``. Empty
        when there are no rows or no company column is found.
    """
    rows = table_data.get("table_info")
    if not rows:
        return []

    # `or []` also covers a header key that is present but set to None.
    headers = table_data.get("table_column_header") or []

    # Find the company column (first matching header wins).
    company_column = next(
        (
            index
            for index, header in enumerate(headers)
            if fuzzy_match(header, PORTFOLIO_COMPANY_LIST_IDENTIFIER)
        ),
        None,
    )
    if company_column is None:
        return []

    # Rows are keyed by the header text, not by its position.
    company_column_name = headers[company_column]
    print("company_column::",company_column)
    print("cpmpany_column_name::",company_column_name)

    companies = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        cell = row.get(company_column_name)
        if cell is None:
            # A null cell used to be stringified into the bogus company "None".
            continue
        company_name = str(cell).strip()
        if company_name and not fuzzy_match(company_name, EXCLUDE_COMPANY_NAMES):
            companies.append(company_name)

    return companies
58
+
59
def get_portfolio_company_list(intermediate_data: List[Dict]) -> List[str]:
    """Extract the unique portfolio companies from all tables in the document.

    Args:
        intermediate_data: Document entries; tables are read from each
            entry's optional ``table_content`` list.

    Returns:
        Unique company names in first-seen document order. The order is
        deterministic, which matters because callers treat the first
        matching company as the primary one.
    """
    # A dict is used as an insertion-ordered set; a plain set would make the
    # result order vary between runs because of string hash randomisation.
    ordered_companies = {}

    for entry in intermediate_data:
        # `or []` covers both a missing key and an explicit None.
        for table in entry.get("table_content") or []:
            companies = extract_portfolio_companies_from_table(table)
            ordered_companies.update(dict.fromkeys(companies))

    return list(ordered_companies)
71
+
72
def merge_content_under_same_header(
    intermediate_data: List[Dict],
    portfolio_company_list: List[str],
    start_index: int
) -> tuple[Dict, int]:
    """
    Merge the entries that follow ``start_index`` into one entry.

    Merging continues while the following entries share the starting entry's
    header, and stops at the first entry that mentions a portfolio company:
    either a company name is a substring of its content, or one of its tables
    yields companies.

    Args:
        intermediate_data: Document entries. The starting entry must provide
            ``header``, ``label_name``, ``page_number`` and ``pdf_name``.
        portfolio_company_list: Company names searched for in entry content.
        start_index: Index of the entry that starts the merged section.

    Returns:
        ``(merged_entry, next_index)`` where ``next_index`` is the index of
        the first entry that was not merged (``len(intermediate_data)`` when
        everything after ``start_index`` was consumed).
    """
    first_entry = intermediate_data[start_index]
    merged_entry = {
        "header": first_entry["header"],
        "content": first_entry.get("content", ""),
        # Copy the list: it is extended below, and extending the original
        # would silently modify the caller's input entry.
        "table_content": list(first_entry.get("table_content") or []),
        "label_name": first_entry["label_name"],
        "page_number": first_entry["page_number"],
        "pdf_name": first_entry["pdf_name"]
    }

    current_index = start_index + 1
    while current_index < len(intermediate_data):
        current_entry = intermediate_data[current_index]

        # Stop when the header changes.
        if current_entry["header"] != merged_entry["header"]:
            break

        entry_content = current_entry.get("content")
        entry_tables = current_entry.get("table_content") or []

        # Stop when this entry mentions a portfolio company. None content is
        # treated as empty text so the substring test cannot raise.
        searchable_text = entry_content or ""
        content_match = any(company in searchable_text
                            for company in portfolio_company_list)
        table_match = any(extract_portfolio_companies_from_table(table)
                          for table in entry_tables)
        if content_match or table_match:
            break

        # Merge content
        if entry_content is not None:
            if merged_entry["content"]:
                merged_entry["content"] += "\n" + entry_content
            else:
                merged_entry["content"] = entry_content

        # Merge tables
        merged_entry["table_content"].extend(entry_tables)

        current_index += 1

    return merged_entry, current_index
124
+
125
+
126
+
127
def _page_sort_key(page: str):
    """Sort key: numeric page labels by value first, then other labels alphabetically."""
    try:
        return (0, int(page), page)
    except ValueError:
        return (1, 0, page)


def process_table_page_ids(merged_output):
    """
    Add the page ids of each entry's tables to that entry's ``page_number``.

    For every entry that has a ``table_content`` key, the existing
    comma-separated ``page_number`` is combined with each table's
    ``metadata['table_page_id']``. The result is de-duplicated and written
    back as a comma-separated string: numeric pages in ascending order,
    followed by any non-numeric labels. Entries without ``table_content``
    are left untouched.

    Args:
        merged_output (list): Entry dicts; they are updated in place.

    Returns:
        list: The same ``merged_output`` list that was passed in.
    """
    for current_merged_entry in merged_output:
        # Only process entries that have table_content
        if 'table_content' not in current_merged_entry:
            continue

        page_numbers = set()

        # Start from the existing page numbers; str() tolerates an int page.
        existing = current_merged_entry.get('page_number')
        if existing is not None:
            page_numbers.update(part.strip() for part in str(existing).split(','))

        # Add the page id recorded in each table's metadata, when present.
        for table in current_merged_entry['table_content'] or []:
            metadata = table.get('metadata') if isinstance(table, dict) else None
            if isinstance(metadata, dict) and metadata.get('table_page_id') is not None:
                page_numbers.add(str(metadata['table_page_id']).strip())

        # Blank tokens (e.g. from "1,,2") would break the numeric sort.
        page_numbers.discard('')

        if page_numbers:
            current_merged_entry['page_number'] = ','.join(
                sorted(page_numbers, key=_page_sort_key)
            )

    return merged_output
155
+
156
+
157
+ ################################################################################################################
158
+ ## The function below handles more than one occurrence of underlying_assets
159
+
160
def _unique_page_numbers(*page_values) -> str:
    """Join page values into one comma-separated string without blanks or duplicates (first-seen order)."""
    pages = []
    for value in page_values:
        if value is None:
            continue
        pages.extend(part.strip() for part in str(value).split(","))
    return ",".join(dict.fromkeys(page for page in pages if page))


def merge_portfolio_company_sections(intermediate_data: List[Dict]) -> tuple[List[Dict], List[str], List[str]]:
    """Merge all content and tables under a portfolio company header until the next company header.

    Entries are walked in order. An entry whose header matches one or more
    portfolio companies starts a new chunk; later entries without a company
    match are folded into that chunk (content, tables and page numbers).
    Entries seen before the first company header are passed through as
    shallow copies.

    When a header matches several companies, the first match becomes the
    chunk that keeps accumulating; every additional match gets its own chunk
    holding only the header entry's content and tables.

    Args:
        intermediate_data: Document entries. Each entry must have ``header``;
            entries that start a chunk must also have ``page_number``,
            ``pdf_name`` and ``label_name``.

    Returns:
        - merged_output: List of merged document sections
        - fuzzy_matched_companies: Companies matched in any header, in
          first-seen order
        - portfolio_companies: List of all portfolio companies found in tables
    """
    portfolio_companies = get_portfolio_company_list(intermediate_data)
    print(f"Extracted portfolio companies: {portfolio_companies}")

    merged_output = []
    # Dict used as an insertion-ordered set, accumulated over ALL entries
    # (previously only the last entry's matches were returned).
    fuzzy_matched_companies = {}
    current_chunk = None

    for entry in intermediate_data:
        header_companies, entry_fuzzy_matches = match_company_names(entry["header"], portfolio_companies)
        fuzzy_matched_companies.update(dict.fromkeys(entry_fuzzy_matches))

        if header_companies:
            print("&"*100)
            print("*"*100)
            print("entry_header::", entry["header"])
            print("page number of header::", entry["page_number"])

            print("*"*100)
            print("header_companies::", header_companies)
            print("*"*100)

            # Finalize the active chunk before starting a new one.
            if current_chunk:
                merged_output.append(current_chunk)

            # One chunk per matched company. Each gets its own copy of the
            # table list so that later extends neither leak into sibling
            # chunks nor mutate the input entry.
            company_chunks = [
                {
                    "page_number": _unique_page_numbers(entry["page_number"]),
                    "pdf_name": entry["pdf_name"],
                    "header": entry["header"],
                    "label_name": entry["label_name"],
                    "content": entry.get("content", ""),
                    "table_content": list(entry.get("table_content") or []),
                    "matched_company": company
                }
                for company in header_companies
            ]

            # The first match keeps accumulating; the others are emitted now.
            current_chunk = company_chunks[0]
            merged_output.extend(company_chunks[1:])

        elif current_chunk:
            # No new company detected: fold this entry into the active chunk.
            merged_pages = []

            entry_content = entry.get("content")
            if entry_content is not None:
                if current_chunk["content"]:
                    current_chunk["content"] += "\n\n" + entry_content
                else:
                    current_chunk["content"] = entry_content
                merged_pages.append(entry.get("page_number"))

            entry_tables = entry.get("table_content")
            if entry_tables is not None:
                current_chunk["table_content"].extend(entry_tables)
                # table_content is a list of tables, so metadata has to be
                # read from each table rather than from the list itself.
                for table in entry_tables:
                    metadata = table.get("metadata") if isinstance(table, dict) else None
                    if isinstance(metadata, dict) and metadata.get("table_page_id") is not None:
                        merged_pages.append(metadata["table_page_id"])
                merged_pages.append(entry.get("page_number"))

            if merged_pages:
                # Always union with the pages already recorded so the header's
                # own page is never dropped.
                current_chunk["page_number"] = _unique_page_numbers(
                    current_chunk["page_number"], *merged_pages
                )

        else:
            # Content before any company section: pass through with unique pages.
            entry_copy = entry.copy()
            if "page_number" in entry_copy:
                entry_copy["page_number"] = _unique_page_numbers(entry_copy["page_number"])
            merged_output.append(entry_copy)

    # Add the last active chunk if it exists.
    if current_chunk:
        current_chunk["page_number"] = _unique_page_numbers(current_chunk["page_number"])
        merged_output.append(current_chunk)

    merged_output_new = process_table_page_ids(merged_output=merged_output)

    return merged_output_new, list(fuzzy_matched_companies), portfolio_companies
283
+
284
+ ################################################################################################
285
+
286
+ ## The code below implements the abbreviation-matching functionality
287
+
288
+ import re
289
+
290
def _abbreviation_patterns(text: str, min_length: int) -> List[str]:
    """Return the initials, vowel-stripped and space-stripped forms of ``text`` that have at least ``min_length`` characters."""
    candidates = [
        ''.join(word[0] for word in text.split()),  # First letter of each word
        re.sub(r'[aeiou\s]', '', text),             # Vowels and whitespace removed
        text.replace(' ', ''),                      # Spaces removed
    ]
    return [pattern for pattern in dict.fromkeys(candidates) if len(pattern) >= min_length]


def match_company_names(
    header_text: str,
    companies: List[str],
    threshold: int = FUZZY_MATCH_THRESHOLD,
    min_abbreviation_length: int = 3
) -> tuple[List[str], List[str]]:
    """Match company names against a header, allowing abbreviated forms on either side.

    For each company, two checks are made with ``fuzz.partial_ratio``:

    1. the header text and its abbreviations against the full company name;
    2. if that fails, the header text against the company's abbreviations.

    Args:
        header_text: Header to search; converted with ``str()``, lower-cased
            and stripped.
        companies: Candidate company names.
        threshold: Minimum ``fuzz.partial_ratio`` score that counts as a match.
        min_abbreviation_length: Derived abbreviations shorter than this are
            ignored. ``partial_ratio`` aligns the shorter string against the
            best window of the longer one, so a one- or two-letter
            abbreviation scores 100 against nearly any text that contains
            those letters. Pass ``1`` to consider every non-empty
            abbreviation.

    Returns:
        ``(matched_companies, fuzzy_matched_companies)``: two lists with the
        same matched companies, de-duplicated, in the order of ``companies``.
    """
    header_text = str(header_text).lower().strip()

    # The header itself is always compared; only derived forms are filtered.
    header_patterns = [header_text] + _abbreviation_patterns(header_text, min_abbreviation_length)

    matches = []
    for company in companies:
        company_lower = str(company).lower()

        # First check: header text (full or abbreviated) against the full name.
        if any(fuzz.partial_ratio(pattern, company_lower) >= threshold
               for pattern in header_patterns):
            matches.append(company)
            continue

        # Second check: header text against the company's abbreviations.
        company_patterns = _abbreviation_patterns(company_lower, min_abbreviation_length)
        if any(fuzz.partial_ratio(header_text, pattern) >= threshold
               for pattern in company_patterns):
            matches.append(company)

    # Remove duplicates while preserving order.
    matched_companies = list(dict.fromkeys(matches))
    fuzzy_matched_companies = list(matched_companies)

    return matched_companies, fuzzy_matched_companies
330
+
331
+
332
+ ################################################################################################################
333
+
334
def process_document_company_wise(
    intermediate_str_chunk_json: List[Dict],
    output_directory: str,
    file_name: str
) -> List[Dict]:
    """Merge the document by portfolio company and save the result as JSON.

    Args:
        intermediate_str_chunk_json: Document entries, or a JSON string that
            decodes to them.
        output_directory: Directory for the output file; created if missing.
            An empty string writes into the current working directory.
        file_name: Prefix of the output file, which is written as
            ``<file_name>_h2h_merged_output.json``.

    Returns:
        The merged entries. When there is at least one entry, the first one
        also carries ``portfolio_companies_list_fuzzy_matched`` and
        ``portfolio_companies_list_before``.
    """
    # Convert string input to dict if needed
    if isinstance(intermediate_str_chunk_json, str):
        intermediate_str_chunk_json = json.loads(intermediate_str_chunk_json)

    # Merge content by company sections
    merged_content, matched_company_list, portfolio_company_list = merge_portfolio_company_sections(
        intermediate_str_chunk_json
    )

    # The company lists are attached to the first entry; an empty result has
    # no entry to attach them to, so skip instead of raising IndexError.
    if merged_content:
        merged_content[0]["portfolio_companies_list_fuzzy_matched"] = matched_company_list
        merged_content[0]["portfolio_companies_list_before"] = portfolio_company_list

    print("matched_company_list::",matched_company_list)
    print("portfolio_company_list::",portfolio_company_list)

    # Ensure output directory exists (os.makedirs("") would raise).
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    # Save output
    output_path = os.path.join(output_directory, f"{file_name}_h2h_merged_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_content, f, indent=4, ensure_ascii=False)
    print(f"Saved merged output to {output_path}")

    return merged_content
364
+
365
+
366
def read_json(file_path):
    """Load the UTF-8 encoded JSON file at ``file_path`` and return the decoded data."""
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
371
+
372
+
373
+ # # Example usage
374
if __name__ == "__main__":
    # Local import: argparse is only needed when the module runs as a script.
    import argparse

    # Every option defaults to the previously hard-coded sample value, so
    # running the script without arguments behaves as before.
    cli = argparse.ArgumentParser(
        description="Merge heading-to-heading chunks into portfolio company sections."
    )
    cli.add_argument(
        "input_json",
        nargs="?",
        default="/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Triton2023Q4_patria_sample_output/Triton2023Q4_patria_sample_json_output/Triton2023Q4_patria_sample_final_h2h_extraction.json",
        help="Path of the heading-to-heading extraction JSON to process.",
    )
    cli.add_argument(
        "--output-directory",
        default="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/output",
        help="Directory that receives the merged output JSON.",
    )
    cli.add_argument(
        "--file-name",
        default="sample_report",
        help="Prefix of the merged output file name.",
    )
    args = cli.parse_args()

    input_str_chunk_json_path = args.input_json
    input_json = read_json(input_str_chunk_json_path)

    # Process the data
    result = process_document_company_wise(
        intermediate_str_chunk_json=input_json,
        output_directory=args.output_directory,
        file_name=args.file_name
    )

    print("Processing complete.")
386
+ # print(json.dumps(result, indent=2))
387
+