Spaces:

anl139
/

test

Sleeping

App Files Files Community

anl139 committed on Feb 11, 2025

Commit

4378ccc

verified ·

1 Parent(s): c845aaa

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -24

app.py CHANGED Viewed

@@ -38,39 +38,113 @@ from pathlib import Path
 # Make sure to import your Document class from your LangChain module.
 from langchain_core.documents import Document
-def clean_org_text(text: str) -> str:
-    """
-    Removes metadata lines (e.g., Title, Organization, Website, etc.)
-    from the organization text. Adjust the regex patterns as needed.
-    """
-    # Remove lines starting with known metadata keys
-    metadata_keys = ["Title:", "Website:", "Twitter:", "Instagram:", "FaceBook:", "Newsletter:", "Year:", "Organization:", "Goal:", "Ranking:"]
-    for key in metadata_keys:
-        # Use regex to remove the key and everything up to the next key or end of string.
-        text = re.sub(rf"{key}\s*.*?(?=(?:{'|'.join(metadata_keys)})|\Z)", '', text, flags=re.IGNORECASE | re.DOTALL)
-    return text.strip()
 def load_and_process_data(file_path: str):
-    """
-    Loads JSON data from a file, extracts organization text and metadata,
-    cleans the org_text by removing redundant metadata, and returns a list of Documents.
-    Documents will have the ranking metadata only if the organization is marked as a winner.
-    """
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
         for entry in data:
-            org_text_full = entry.get("OrganizationText", "")
-            if not org_text_full:
                 continue
-            metadata = extract_metadata(org_text_full)
-            # Create a cleaned version of the text (without the redundant metadata)
-            org_text_clean = clean_org_text(org_text_full)
-            # Insert winners at the beginning of the list
             if metadata.get("ranking", "").lower() == "winner":
-                docs.insert(0, Document(page_content=org_text_clean, metadata=metadata))
             else:
-                docs.append(Document(page_content=org_text_clean, metadata=metadata))
         return docs
     except Exception as e:
         print(f"Error loading JSON: {e}")

 # Make sure to import your Document class from your LangChain module.
 from langchain_core.documents import Document
+def extract_metadata(text: str) -> tuple[dict, str]:
+    metadata = {}
+    cleaned_text = text  # Start with the original text
+    # Extract and remove Title
+    title_match = re.search(
+        r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
+        cleaned_text,
+        re.IGNORECASE | re.DOTALL
+    )
+    if title_match:
+        metadata["title"] = title_match.group(1).strip()
+        # Remove Title from cleaned_text
+        cleaned_text = re.sub(
+            r"Title:\s*.*?(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
+            "",
+            cleaned_text,
+            flags=re.IGNORECASE | re.DOTALL
+        )
+    # Extract and remove Ranking (only if "winner")
+    ranking_match = re.search(
+        r"Ranking:\s*(.*?)\s+(?=Impact Metrics:|$)",
+        cleaned_text,
+        re.IGNORECASE | re.DOTALL
+    )
+    if ranking_match:
+        ranking_value = ranking_match.group(1).strip()
+        if ranking_value.lower() == "winner":
+            metadata["ranking"] = ranking_value
+        # Remove Ranking from cleaned_text
+        cleaned_text = re.sub(
+            r"Ranking:\s*.*?(?=Impact Metrics:|$)",
+            "",
+            cleaned_text,
+            flags=re.IGNORECASE | re.DOTALL
+        )
+    # Extract and remove Year
+    year_match = re.search(r"Year:\s*(\d{4})", cleaned_text, re.IGNORECASE)
+    if year_match:
+        metadata["year"] = year_match.group(1).strip()
+        # Remove Year from cleaned_text
+        cleaned_text = re.sub(r"Year:\s*\d{4}", "", cleaned_text, flags=re.IGNORECASE)
+    # Extract and remove Organization
+    org_match = re.search(
+        r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
+        cleaned_text,
+        re.IGNORECASE | re.DOTALL
+    )
+    if org_match:
+        metadata["organization"] = org_match.group(1).strip()
+        # Remove Organization from cleaned_text
+        cleaned_text = re.sub(
+            r"Organization:\s*.*?(?=Goal:|Ranking:|Impact Metrics:)",
+            "",
+            cleaned_text,
+            flags=re.IGNORECASE | re.DOTALL
+        )
+    # Extract and remove URLs (Website, Volunteer, Newsletter)
+    urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", cleaned_text)
+    for key, url in urls:
+        metadata[key.lower()] = url.strip()
+        # Remove URL from cleaned_text
+        cleaned_text = re.sub(
+            rf"{key}:\s*{re.escape(url)}",
+            "",
+            cleaned_text,
+            flags=re.IGNORECASE
+        )
+    # Extract and remove social handles
+    social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", cleaned_text)
+    for platform, handle in social:
+        if handle.startswith("http"):
+            metadata[platform.lower()] = handle.strip()
+        else:
+            metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
+        # Remove social handle from cleaned_text
+        cleaned_text = re.sub(
+            rf"{platform}:\s*{re.escape(handle)}",
+            "",
+            cleaned_text,
+            flags=re.IGNORECASE
+        )
+    # Clean up extra whitespace
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+    return metadata, cleaned_text
 def load_and_process_data(file_path: str):
     try:
         data = json.loads(Path(file_path).read_text(encoding='utf-8'))
         docs = []
         for entry in data:
+            org_text = entry.get("OrganizationText", "")
+            if not org_text:
                 continue
+            metadata, cleaned_text = extract_metadata(org_text)  # Now returns cleaned text
             if metadata.get("ranking", "").lower() == "winner":
+                docs.insert(0, Document(page_content=cleaned_text, metadata=metadata))
             else:
+                docs.append(Document(page_content=cleaned_text, metadata=metadata))
         return docs
     except Exception as e:
         print(f"Error loading JSON: {e}")