Spaces:

anl139
/

test

Sleeping

App Files Files Community

anl139 committed on Feb 11, 2025

Commit

c845aaa

verified ·

1 Parent(s): 9046a80

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -60

app.py CHANGED Viewed

@@ -38,83 +38,43 @@ from pathlib import Path
 # Make sure to import your Document class from your LangChain module.
 from langchain_core.documents import Document
def extract_metadata(text: str) -> dict:
    """
    Pull the labelled fields out of an organization text blob.

    The text is expected to contain "Label: value" fields separated by
    whitespace. Only fields that are actually found are added to the result.

    Args:
        text: Raw organization text.

    Returns:
        A dict that may contain these keys (all values are strings):
        - "title", "organization", "year" (four digits)
        - "ranking": present only when the ranking value is "winner"
          (compared case-insensitively)
        - "website", "volunteer", "newsletter": the token following the label
        - "twitter", "instagram", "facebook": when the value is a full URL
        - "<platform>_handle": a profile URL built from a bare handle
    """
    flags = re.IGNORECASE | re.DOTALL
    # A "value" that is really the next field label means the field was left
    # empty; without this guard "Website: Twitter: x" yields website="Twitter:".
    not_a_label = (
        r"(?!(?:Title|Website|Volunteer|Newsletter|Twitter|Instagram|FaceBook"
        r"|Year|Organization|Goal|Ranking|Impact Metrics):)"
    )
    metadata = {}

    # Title runs up to the first link-style label that follows it.
    title_match = re.search(
        r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
        text,
        flags,
    )
    if title_match:
        metadata["title"] = title_match.group(1).strip()

    # Ranking is recorded only for winners. The separator is `\s*` (not `\s+`)
    # so a ranking that is the very last token of the text is still matched.
    ranking_match = re.search(
        r"Ranking:\s*(.*?)\s*(?=Impact Metrics:|$)",
        text,
        flags,
    )
    if ranking_match:
        ranking_value = ranking_match.group(1).strip()
        if ranking_value.lower() == "winner":
            metadata["ranking"] = ranking_value

    # Year: first four-digit number after the label.
    year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
    if year_match:
        metadata["year"] = year_match.group(1)

    # Organization runs up to the next of Goal / Ranking / Impact Metrics.
    org_match = re.search(
        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
        text,
        flags,
    )
    if org_match:
        metadata["organization"] = org_match.group(1).strip()

    # Link fields: the scheme is optional, the value is one whitespace-free token.
    for key, url in re.findall(
        rf"(Website|Volunteer|Newsletter):\s*{not_a_label}((?:https?://)?\S+)", text
    ):
        metadata[key.lower()] = url

    # Social fields: keep full URLs as-is, expand bare handles to profile URLs.
    for platform, handle in re.findall(
        rf"(Twitter|Instagram|FaceBook):\s*{not_a_label}(\S+)", text
    ):
        site = platform.lower()
        if handle.startswith("http"):
            metadata[site] = handle
            continue
        # "@name" is a display convention, not part of the profile path.
        username = handle.lstrip("@")
        if username:
            metadata[f"{site}_handle"] = f"https://{site}.com/{username}"
    return metadata
 def load_and_process_data(file_path: str):
     """
     Loads JSON data from a file, extracts organization text and metadata,
-    and returns a list of Documents. Documents will have the ranking metadata
-    only if the organization is marked as a winner.
     """
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
         for entry in data:
-            org_text = entry.get("OrganizationText", "")
-            if not org_text:
                 continue
-            metadata = extract_metadata(org_text)
             # Insert winners at the beginning of the list
             if metadata.get("ranking", "").lower() == "winner":
-                docs.insert(0, Document(page_content=org_text, metadata=metadata))
             else:
-                docs.append(Document(page_content=org_text, metadata=metadata))
         return docs
     except Exception as e:
         print(f"Error loading JSON: {e}")
         return []
 # -------------------------------
 # Data Loading and Preprocessing
 # -------------------------------

 # Make sure to import your Document class from your LangChain module.
 from langchain_core.documents import Document
# Labels whose whole "Label: value" span is stripped from the organization text.
_REMOVED_FIELD_LABELS = (
    "Title:", "Website:", "Twitter:", "Instagram:", "FaceBook:",
    "Newsletter:", "Year:", "Organization:", "Goal:", "Ranking:",
)


def clean_org_text(text: str, *, boundary_keys=("Impact Metrics:", "Volunteer:")) -> str:
    """
    Strip the redundant metadata fields from an organization text.

    Every field introduced by one of the removed labels (Title, Website,
    Twitter, Instagram, FaceBook, Newsletter, Year, Organization, Goal,
    Ranking) is deleted together with its value. A value ends at the next
    known label or at the end of the text. Labels are matched
    case-insensitively and only at the start of a word, so "Subtitle:" is
    not mistaken for "Title:".

    Args:
        text: Raw organization text.
        boundary_keys: Labels that end a removed field's value but are
            themselves kept. Without them, a kept section that follows a
            removed field would be deleted along with that field.

    Returns:
        The remaining text with surrounding whitespace trimmed; "" for
        empty input.
    """
    if not text:
        return ""
    removed = "|".join(re.escape(label) for label in _REMOVED_FIELD_LABELS)
    stops = "|".join(
        re.escape(label) for label in (*_REMOVED_FIELD_LABELS, *boundary_keys)
    )
    # One pass: a removed label, then lazily everything up to (not including)
    # the next label of either kind, or the end of the text.
    field = re.compile(
        rf"(?<!\w)(?:{removed}).*?(?=(?<!\w)(?:{stops})|\Z)",
        re.IGNORECASE | re.DOTALL,
    )
    return field.sub("", text).strip()
 def load_and_process_data(file_path: str):
     """
     Loads JSON data from a file, extracts organization text and metadata,
+    cleans the org_text by removing redundant metadata, and returns a list of Documents.
+    Documents will have the ranking metadata only if the organization is marked as a winner.
     """
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
         for entry in data:
+            org_text_full = entry.get("OrganizationText", "")
+            if not org_text_full:
                 continue
+            metadata = extract_metadata(org_text_full)
+            # Create a cleaned version of the text (without the redundant metadata)
+            org_text_clean = clean_org_text(org_text_full)
             # Insert winners at the beginning of the list
             if metadata.get("ranking", "").lower() == "winner":
+                docs.insert(0, Document(page_content=org_text_clean, metadata=metadata))
             else:
+                docs.append(Document(page_content=org_text_clean, metadata=metadata))
         return docs
     except Exception as e:
         print(f"Error loading JSON: {e}")
         return []
 # -------------------------------
 # Data Loading and Preprocessing
 # -------------------------------