anl139 committed on
Commit
4378ccc
·
verified ·
1 Parent(s): c845aaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -24
app.py CHANGED
@@ -38,39 +38,113 @@ from pathlib import Path
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
def clean_org_text(text: str) -> str:
    """Strip metadata segments (Title, Website, Ranking, ...) from organization text.

    For each known metadata key, the key and everything following it up to
    the next metadata key (or the end of the string) is removed. Matching is
    case-insensitive and spans newlines.

    Args:
        text: Raw organization text that may contain ``Key: value`` segments.

    Returns:
        The text with all metadata segments removed and surrounding
        whitespace stripped.
    """
    metadata_keys = [
        "Title:", "Website:", "Twitter:", "Instagram:", "FaceBook:",
        "Newsletter:", "Year:", "Organization:", "Goal:", "Ranking:",
    ]
    # The "next key" alternation is the same for every key, so build it once
    # instead of re-joining inside the loop. Escaping keeps the keys literal
    # even if one with regex metacharacters is added later.
    escaped_keys = [re.escape(key) for key in metadata_keys]
    next_key = "|".join(escaped_keys)
    for key in escaped_keys:
        # Remove the key and everything up to the next key or end of string.
        text = re.sub(
            rf"{key}\s*.*?(?=(?:{next_key})|\Z)",
            "",
            text,
            flags=re.IGNORECASE | re.DOTALL,
        )
    return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def load_and_process_data(file_path: str):
54
- """
55
- Loads JSON data from a file, extracts organization text and metadata,
56
- cleans the org_text by removing redundant metadata, and returns a list of Documents.
57
- Documents will have the ranking metadata only if the organization is marked as a winner.
58
- """
59
  try:
60
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
61
  docs = []
62
  for entry in data:
63
- org_text_full = entry.get("OrganizationText", "")
64
- if not org_text_full:
65
  continue
66
- metadata = extract_metadata(org_text_full)
67
- # Create a cleaned version of the text (without the redundant metadata)
68
- org_text_clean = clean_org_text(org_text_full)
69
- # Insert winners at the beginning of the list
70
  if metadata.get("ranking", "").lower() == "winner":
71
- docs.insert(0, Document(page_content=org_text_clean, metadata=metadata))
72
  else:
73
- docs.append(Document(page_content=org_text_clean, metadata=metadata))
74
  return docs
75
  except Exception as e:
76
  print(f"Error loading JSON: {e}")
 
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
def extract_metadata(text: str) -> tuple[dict, str]:
    """Extract known metadata fields from organization text and strip them out.

    Recognized fields are Title, Ranking, Year, Organization, the URL fields
    (Website, Volunteer, Newsletter) and the social fields (Twitter,
    Instagram, FaceBook). Each field that is found is removed from the text.

    Args:
        text: Raw organization text containing ``Key: value`` segments.

    Returns:
        A ``(metadata, cleaned_text)`` tuple. ``metadata`` may contain the
        keys ``title``, ``ranking`` (only when the value is "winner"),
        ``year``, ``organization``, the lower-cased URL field names, and for
        each social platform either ``<platform>`` (value already a URL) or
        ``<platform>_handle`` (a profile URL built from the handle).
        ``cleaned_text`` is the remaining text with runs of whitespace
        collapsed to single spaces.
    """
    metadata = {}
    cleaned_text = text  # Start with the original text

    # Extract and remove Title (everything up to the first link/social key).
    title_match = re.search(
        r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
        cleaned_text,
        re.IGNORECASE | re.DOTALL,
    )
    if title_match:
        metadata["title"] = title_match.group(1).strip()
        cleaned_text = re.sub(
            r"Title:\s*.*?(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
            "",
            cleaned_text,
            flags=re.IGNORECASE | re.DOTALL,
        )

    # Extract and remove Ranking; it is recorded only when it is "winner".
    # The separator before the lookahead is `\s*` (not `\s+`) so a ranking
    # that is the very last thing in the text, with no trailing whitespace,
    # is still recognized via the `$` alternative.
    ranking_match = re.search(
        r"Ranking:\s*(.*?)\s*(?=Impact Metrics:|$)",
        cleaned_text,
        re.IGNORECASE | re.DOTALL,
    )
    if ranking_match:
        ranking_value = ranking_match.group(1).strip()
        if ranking_value.lower() == "winner":
            metadata["ranking"] = ranking_value
        cleaned_text = re.sub(
            r"Ranking:\s*.*?(?=Impact Metrics:|$)",
            "",
            cleaned_text,
            flags=re.IGNORECASE | re.DOTALL,
        )

    # Extract and remove Year (four digits).
    year_match = re.search(r"Year:\s*(\d{4})", cleaned_text, re.IGNORECASE)
    if year_match:
        metadata["year"] = year_match.group(1).strip()
        cleaned_text = re.sub(
            r"Year:\s*\d{4}", "", cleaned_text, flags=re.IGNORECASE
        )

    # Extract and remove Organization (everything up to Goal/Ranking/Impact Metrics).
    org_match = re.search(
        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
        cleaned_text,
        re.IGNORECASE | re.DOTALL,
    )
    if org_match:
        metadata["organization"] = org_match.group(1).strip()
        cleaned_text = re.sub(
            r"Organization:\s*.*?(?=Goal:|Ranking:|Impact Metrics:)",
            "",
            cleaned_text,
            flags=re.IGNORECASE | re.DOTALL,
        )

    # Extract and remove URLs (Website, Volunteer, Newsletter).
    # NOTE: detection here is case-sensitive, while the removal below is
    # case-insensitive.
    urls = re.findall(
        r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", cleaned_text
    )
    for key, url in urls:
        metadata[key.lower()] = url.strip()
        cleaned_text = re.sub(
            rf"{key}:\s*{re.escape(url)}",
            "",
            cleaned_text,
            flags=re.IGNORECASE,
        )

    # Extract and remove social handles.
    social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", cleaned_text)
    for platform, handle in social:
        if handle.startswith("http"):
            # Value is already a full URL; store it under the platform name.
            metadata[platform.lower()] = handle.strip()
        else:
            # Bare handle: build a profile URL from the platform name.
            metadata[f"{platform.lower()}_handle"] = (
                f"https://{platform.lower()}.com/{handle.strip()}"
            )
        cleaned_text = re.sub(
            rf"{platform}:\s*{re.escape(handle)}",
            "",
            cleaned_text,
            flags=re.IGNORECASE,
        )

    # Collapse the gaps left behind by the removals.
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return metadata, cleaned_text
133
+
134
 
135
  def load_and_process_data(file_path: str):
 
 
 
 
 
136
  try:
137
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
138
  docs = []
139
  for entry in data:
140
+ org_text = entry.get("OrganizationText", "")
141
+ if not org_text:
142
  continue
143
+ metadata, cleaned_text = extract_metadata(org_text) # Now returns cleaned text
 
 
 
144
  if metadata.get("ranking", "").lower() == "winner":
145
+ docs.insert(0, Document(page_content=cleaned_text, metadata=metadata))
146
  else:
147
+ docs.append(Document(page_content=cleaned_text, metadata=metadata))
148
  return docs
149
  except Exception as e:
150
  print(f"Error loading JSON: {e}")