anl139 committed on
Commit
c845aaa
·
verified ·
1 Parent(s): 9046a80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -60
app.py CHANGED
@@ -38,83 +38,43 @@ from pathlib import Path
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
- def extract_metadata(text: str) -> dict:
42
- metadata = {}
43
-
44
- # Extract the Title field
45
- title_match = re.search(
46
- r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
47
- text,
48
- re.IGNORECASE | re.DOTALL
49
- )
50
- if title_match:
51
- metadata["title"] = title_match.group(1).strip()
52
-
53
- # Extract the Ranking field but only add it if the value is "winner"
54
- ranking_match = re.search(
55
- r"Ranking:\s*(.*?)\s+(?=Impact Metrics:|$)",
56
- text,
57
- re.IGNORECASE | re.DOTALL
58
- )
59
- if ranking_match:
60
- ranking_value = ranking_match.group(1).strip()
61
- if ranking_value.lower() == "winner":
62
- metadata["ranking"] = ranking_value
63
-
64
- # Extract the Year field (assuming a four-digit year)
65
- year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
66
- if year_match:
67
- metadata["year"] = year_match.group(1).strip()
68
-
69
- # Extract the Organization field
70
- org_match = re.search(
71
- r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
72
- text,
73
- re.IGNORECASE | re.DOTALL
74
- )
75
- if org_match:
76
- metadata["organization"] = org_match.group(1).strip()
77
-
78
- # Modified URL extraction: make http/https optional.
79
- urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
80
- for key, url in urls:
81
- metadata[key.lower()] = url.strip()
82
-
83
- # Adjust social handle extraction to capture full URLs.
84
- social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
85
- for platform, handle in social:
86
- if handle.startswith("http"):
87
- metadata[platform.lower()] = handle.strip()
88
- else:
89
- metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
90
-
91
- return metadata
92
-
93
 
94
  def load_and_process_data(file_path: str):
95
  """
96
  Loads JSON data from a file, extracts organization text and metadata,
97
- and returns a list of Documents. Documents will have the ranking metadata
98
- only if the organization is marked as a winner.
99
  """
100
  try:
101
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
102
  docs = []
103
  for entry in data:
104
- org_text = entry.get("OrganizationText", "")
105
- if not org_text:
106
  continue
107
- metadata = extract_metadata(org_text)
 
 
108
  # Insert winners at the beginning of the list
109
  if metadata.get("ranking", "").lower() == "winner":
110
- docs.insert(0, Document(page_content=org_text, metadata=metadata))
111
  else:
112
- docs.append(Document(page_content=org_text, metadata=metadata))
113
  return docs
114
  except Exception as e:
115
  print(f"Error loading JSON: {e}")
116
  return []
117
-
118
  # -------------------------------
119
  # Data Loading and Preprocessing
120
  # -------------------------------
 
38
  # Make sure to import your Document class from your LangChain module.
39
  from langchain_core.documents import Document
40
 
41
+ def clean_org_text(text: str) -> str:
42
+ """
43
+ Removes metadata lines (e.g., Title, Organization, Website, etc.)
44
+ from the organization text. Adjust the regex patterns as needed.
45
+ """
46
+ # Remove lines starting with known metadata keys
47
+ metadata_keys = ["Title:", "Website:", "Twitter:", "Instagram:", "FaceBook:", "Newsletter:", "Year:", "Organization:", "Goal:", "Ranking:"]
48
+ for key in metadata_keys:
49
+ # Use regex to remove the key and everything up to the next key or end of string.
50
+ text = re.sub(rf"{key}\s*.*?(?=(?:{'|'.join(metadata_keys)})|\Z)", '', text, flags=re.IGNORECASE | re.DOTALL)
51
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def load_and_process_data(file_path: str):
54
  """
55
  Loads JSON data from a file, extracts organization text and metadata,
56
+ cleans the org_text by removing redundant metadata, and returns a list of Documents.
57
+ Documents will have the ranking metadata only if the organization is marked as a winner.
58
  """
59
  try:
60
  data = json.loads(Path(file_path).read_text(encoding='utf-8'))
61
  docs = []
62
  for entry in data:
63
+ org_text_full = entry.get("OrganizationText", "")
64
+ if not org_text_full:
65
  continue
66
+ metadata = extract_metadata(org_text_full)
67
+ # Create a cleaned version of the text (without the redundant metadata)
68
+ org_text_clean = clean_org_text(org_text_full)
69
  # Insert winners at the beginning of the list
70
  if metadata.get("ranking", "").lower() == "winner":
71
+ docs.insert(0, Document(page_content=org_text_clean, metadata=metadata))
72
  else:
73
+ docs.append(Document(page_content=org_text_clean, metadata=metadata))
74
  return docs
75
  except Exception as e:
76
  print(f"Error loading JSON: {e}")
77
  return []
 
78
  # -------------------------------
79
  # Data Loading and Preprocessing
80
  # -------------------------------