anl139 committed on
Commit
028664e
·
verified ·
1 Parent(s): 3797703

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -104
app.py CHANGED
@@ -38,136 +38,58 @@ from pathlib import Path
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
- def extract_metadata(text: str) -> tuple[dict, str]:
42
  metadata = {}
43
- cleaned_text = text # Start with the original text
44
 
45
- # Extract and remove Title
46
  title_match = re.search(
47
  r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
48
- cleaned_text,
49
  re.IGNORECASE | re.DOTALL
50
  )
51
  if title_match:
52
  metadata["title"] = title_match.group(1).strip()
53
- cleaned_text = re.sub(
54
- r"Title:\s*.*?(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
55
- "",
56
- cleaned_text,
57
- flags=re.IGNORECASE | re.DOTALL
58
- )
59
 
60
- # Extract and remove Ranking (only if "winner")
 
 
 
 
 
 
 
 
 
61
  ranking_match = re.search(
62
- r"Ranking:\s*(.*?)\s+(?=Impact Metrics:|$)",
63
- cleaned_text,
64
  re.IGNORECASE | re.DOTALL
65
  )
66
  if ranking_match:
67
- ranking_value = ranking_match.group(1).strip()
68
- if ranking_value.lower() == "winner":
69
- metadata["ranking"] = ranking_value
70
- cleaned_text = re.sub(
71
- r"Ranking:\s*.*?(?=Impact Metrics:|$)",
72
- "",
73
- cleaned_text,
74
- flags=re.IGNORECASE | re.DOTALL
75
- )
76
 
77
- # Extract and remove Year
78
- year_match = re.search(r"Year:\s*(\d{4})", cleaned_text, re.IGNORECASE)
79
  if year_match:
80
  metadata["year"] = year_match.group(1).strip()
81
- cleaned_text = re.sub(r"Year:\s*\d{4}", "", cleaned_text, flags=re.IGNORECASE)
82
-
83
- # Extract and remove Organization
84
- org_match = re.search(
85
- r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
86
- cleaned_text,
87
- re.IGNORECASE | re.DOTALL
88
- )
89
- if org_match:
90
- metadata["organization"] = org_match.group(1).strip()
91
- cleaned_text = re.sub(
92
- r"Organization:\s*.*?(?=Goal:|Ranking:|Impact Metrics:)",
93
- "",
94
- cleaned_text,
95
- flags=re.IGNORECASE | re.DOTALL
96
- )
97
 
98
- # Extract and remove URLs (Website, Volunteer, Newsletter)
99
- urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", cleaned_text)
100
  for key, url in urls:
101
  metadata[key.lower()] = url.strip()
102
- cleaned_text = re.sub(
103
- rf"{key}:\s*{re.escape(url)}",
104
- "",
105
- cleaned_text,
106
- flags=re.IGNORECASE
107
- )
108
 
109
- # Extract and remove social handles (Twitter, Instagram, FaceBook)
110
- social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", cleaned_text)
111
  for platform, handle in social:
112
  if handle.startswith("http"):
113
  metadata[platform.lower()] = handle.strip()
114
  else:
115
  metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
116
- cleaned_text = re.sub(
117
- rf"{platform}:\s*{re.escape(handle)}",
118
- "",
119
- cleaned_text,
120
- flags=re.IGNORECASE
121
- )
122
-
123
- # Extract and remove Working Areas in LA
124
- working_match = re.search(
125
- r"Working Areas in LA:\s*(.*?)\s+(?=Summary:|Ranking:|Impact Metrics:|$)",
126
- cleaned_text,
127
- re.IGNORECASE | re.DOTALL
128
- )
129
- if working_match:
130
- metadata["working_areas"] = working_match.group(1).strip()
131
- cleaned_text = re.sub(
132
- r"Working Areas in LA:\s*.*?(?=Summary:|Ranking:|Impact Metrics:|$)",
133
- "",
134
- cleaned_text,
135
- flags=re.IGNORECASE | re.DOTALL
136
- )
137
 
138
- # Extract and remove Zipcode (assuming 5-digit US zipcodes)
139
- zipcode_match = re.search(r"Zipcode:\s*(\d{5})", cleaned_text, re.IGNORECASE)
140
- if zipcode_match:
141
- metadata["zipcode"] = zipcode_match.group(1).strip()
142
- cleaned_text = re.sub(r"Zipcode:\s*\d{5}", "", cleaned_text, flags=re.IGNORECASE)
143
-
144
- # Clean up extra whitespace
145
- cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
146
-
147
- # Create a metadata summary to append to the cleaned text.
148
- meta_summary = ""
149
- if "year" in metadata:
150
- meta_summary += f"Year: {metadata['year']}. "
151
- if "ranking" in metadata:
152
- meta_summary += f"Ranking: {metadata['ranking']}. "
153
- if "organization" in metadata:
154
- meta_summary += f"Organization: {metadata['organization']}. "
155
- if "working_areas" in metadata:
156
- meta_summary += f"Working Areas in LA: {metadata['working_areas']}. "
157
- if "zipcode" in metadata:
158
- meta_summary += f"Zipcode: {metadata['zipcode']}. "
159
-
160
- combined_text = meta_summary + "\n" + cleaned_text if meta_summary else cleaned_text
161
-
162
- return metadata, combined_text
163
 
164
 
165
  def load_and_process_data(file_path: str):
166
- """
167
- Loads JSON data from a file, extracts organization text and metadata (including working areas and zipcode),
168
- cleans the text by removing redundant metadata, and returns a list of Documents.
169
- Documents with a "winner" ranking are inserted at the beginning of the list.
170
- """
171
  try:
172
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
173
  docs = []
@@ -175,16 +97,18 @@ def load_and_process_data(file_path: str):
175
  org_text = entry.get("OrganizationText", "")
176
  if not org_text:
177
  continue
178
- metadata, combined_text = extract_metadata(org_text)
 
179
  if metadata.get("ranking", "").lower() == "winner":
180
- docs.insert(0, Document(page_content=combined_text, metadata=metadata))
181
  else:
182
- docs.append(Document(page_content=combined_text, metadata=metadata))
183
  return docs
184
  except Exception as e:
185
  print(f"Error loading JSON: {e}")
186
  return []
187
 
 
188
  # -------------------------------
189
  # Data Loading and Preprocessing
190
  # -------------------------------
 
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
def extract_metadata(text: str) -> dict:
    """Extract labelled fields from an organization's free-text description.

    The text is scanned for ``Label: value`` pairs. Every field is optional;
    a key is only present in the result when its label was found.

    Args:
        text: Raw organization text containing labels such as ``Title:``,
            ``Organization:``, ``Ranking:``, ``Year:``, ``Website:`` and
            social-media labels.

    Returns:
        A dict that may contain:
          * ``title``, ``organization``, ``ranking``, ``year`` - stripped strings.
          * ``website``, ``volunteer``, ``newsletter`` - the token following
            the label (with or without an ``http(s)://`` scheme).
          * ``twitter`` / ``instagram`` / ``facebook`` - when the value after
            the label is already a URL.
          * ``twitter_handle`` / ``instagram_handle`` / ``facebook_handle`` -
            a profile URL built from a bare handle.
        An empty dict is returned for empty or ``None`` input.
    """
    metadata = {}
    if not text:
        # Nothing to scan; also avoids a TypeError from ``re`` on None.
        return metadata

    multi_flags = re.IGNORECASE | re.DOTALL

    # Title runs until the next recognised label (contact/social fields).
    title_match = re.search(
        r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
        text,
        multi_flags,
    )
    if title_match:
        metadata["title"] = title_match.group(1).strip()

    # Organization runs until the next recognised label.
    org_match = re.search(
        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
        text,
        multi_flags,
    )
    if org_match:
        metadata["organization"] = org_match.group(1).strip()

    # Ranking ends at "Impact Metrics:" or, because re.MULTILINE is not set,
    # at the very end of the text ("$" does not match at inner line breaks).
    ranking_match = re.search(
        r"Ranking:\s*(.*?)\s*(?:Impact Metrics:|$)",
        text,
        multi_flags,
    )
    if ranking_match:
        metadata["ranking"] = ranking_match.group(1).strip()

    # Year is assumed to be a four-digit number.
    year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
    if year_match:
        metadata["year"] = year_match.group(1).strip()

    # URL-style fields. Matched case-insensitively so they agree with the
    # other label patterns; the key is normalised to lowercase either way.
    url_pairs = re.findall(
        r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)",
        text,
        re.IGNORECASE,
    )
    for label, url in url_pairs:
        metadata[label.lower()] = url.strip()

    # Social fields: keep full URLs as-is, turn bare handles into profile URLs.
    social_pairs = re.findall(
        r"(Twitter|Instagram|FaceBook):\s*(\S+)",
        text,
        re.IGNORECASE,
    )
    for platform, handle in social_pairs:
        site = platform.lower()
        value = handle.strip()
        if value.startswith("http"):
            metadata[site] = value
            continue
        # Handles are often written "@name"; the "@" is not part of the
        # profile path, so drop it before building the URL.
        name = value.lstrip("@")
        if not name:
            continue
        metadata[f"{site}_handle"] = f"https://{site}.com/{name}"

    return metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  def load_and_process_data(file_path: str):
 
 
 
 
 
93
  try:
94
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
95
  docs = []
 
97
  org_text = entry.get("OrganizationText", "")
98
  if not org_text:
99
  continue
100
+ metadata = extract_metadata(org_text)
101
+ # Optionally, prioritize winners
102
  if metadata.get("ranking", "").lower() == "winner":
103
+ docs.insert(0, Document(page_content=org_text, metadata=metadata))
104
  else:
105
+ docs.append(Document(page_content=org_text, metadata=metadata))
106
  return docs
107
  except Exception as e:
108
  print(f"Error loading JSON: {e}")
109
  return []
110
 
111
+
112
  # -------------------------------
113
  # Data Loading and Preprocessing
114
  # -------------------------------