anl139 committed on
Commit
4ac8a78
·
verified ·
1 Parent(s): 85d9ea1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -90
app.py CHANGED
@@ -31,106 +31,51 @@ os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
31
  # Utility Functions
32
  # -------------------------------
33
 
34
- def extract_metadata(text: str) -> dict:
35
- metadata = {}
36
-
37
- # Extract the Title field
38
- title_match = re.search(
39
- r"Title:\s*(.*?)\s+(?=Website:|Twitter:|Instagram:|FaceBook:|Newsletter:)",
40
- text,
41
- re.IGNORECASE | re.DOTALL
42
- )
43
- if title_match:
44
- metadata["title"] = title_match.group(1).strip()
45
-
46
- # Extract the Ranking field and store raw ranking info
47
- ranking_match = re.search(r"Ranking:\s*([^\n]+)", text, re.IGNORECASE)
48
- if ranking_match:
49
- ranking_value = ranking_match.group(1).strip()
50
- metadata["raw_ranking"] = ranking_value # Store full ranking information
51
- if ranking_value.lower() == "winner":
52
- metadata["LA2050 Grant Winner"] = ranking_value
53
-
54
- # Extract the Year field (assuming a four-digit year)
55
- year_match = re.search(r"Year:\s*(\d{4})", text, re.IGNORECASE)
56
- if year_match:
57
- metadata["year"] = year_match.group(1).strip()
58
-
59
- # Extract the Organization field
60
- org_match = re.search(
61
- r"Organization:\s*(.*?)\s+(?=Goal:|Ranking:|Impact Metrics:)",
62
- text,
63
- re.IGNORECASE | re.DOTALL
64
- )
65
- if org_match:
66
- metadata["organization"] = org_match.group(1).strip()
67
-
68
- # Modified URL extraction: make http/https optional.
69
- urls = re.findall(r"(Website|Volunteer|Newsletter):\s*((?:https?://)?\S+)", text)
70
- for key, url in urls:
71
- metadata[key.lower()] = url.strip()
72
-
73
- # Adjust social handle extraction to capture full URLs.
74
- social = re.findall(r"(Twitter|Instagram|FaceBook):\s*(\S+)", text)
75
- for platform, handle in social:
76
- if handle.startswith("http"):
77
- metadata[platform.lower()] = handle.strip()
78
- else:
79
- metadata[f"{platform.lower()}_handle"] = f"https://{platform.lower()}.com/{handle.strip()}"
80
-
81
- # Extract Working Areas in LA (if available)
82
- working_match = re.search(r"Working Areas in LA:\s*(.*?)\s+(?=Summary:|$)", text, re.IGNORECASE | re.DOTALL)
83
- if working_match:
84
- metadata["working_areas_in_la"] = working_match.group(1).strip()
85
-
86
- # Extract Zipcode (if available; assuming it is a 5-digit number)
87
- zipcode_match = re.search(r"Zipcode:\s*(\d{5})", text, re.IGNORECASE)
88
- if zipcode_match:
89
- metadata["zipcode"] = zipcode_match.group(1).strip()
90
-
91
- return metadata
92
-
93
-
94
- def load_and_process_data(file_path: str):
95
- """
96
- Loads JSON data from a file, extracts organization text and metadata,
97
- and returns a list of Documents. Documents will have the ranking metadata
98
- only if the organization is marked as a winner.
99
- """
100
- try:
101
- data = json.loads(Path(file_path).read_text(encoding='utf-8'))
102
- docs = []
103
- for entry in data:
104
- org_text = entry.get("OrganizationText", "")
105
- if not org_text:
106
- continue
107
- metadata = extract_metadata(org_text)
108
- # Insert winners at the beginning of the list
109
- if metadata.get("LA2050 Grant Winner", "").lower() == "winner":
110
- docs.insert(0, Document(page_content=org_text, metadata=metadata))
111
- else:
112
- docs.append(Document(page_content=org_text, metadata=metadata))
113
- return docs
114
- except Exception as e:
115
- print(f"Error loading JSON: {e}")
116
- return []
117
 
118
- # -------------------------------
119
- # Data Loading and Preprocessing
120
- # -------------------------------
 
 
 
 
 
 
121
 
122
- file_path = './data.json' # Ensure this file is available in your environment.
123
- docs = load_and_process_data(file_path)
124
 
125
  # Use a text splitter to create chunks from the documents.
126
  # (If you find that key fields are getting split, consider implementing a custom splitter.)
127
  from langchain_text_splitters import RecursiveCharacterTextSplitter
128
  text_splitter = RecursiveCharacterTextSplitter(
129
- chunk_size=1600,
130
  chunk_overlap=100,
131
  add_start_index=True
132
  )
133
- all_splits = text_splitter.split_documents(docs)
134
 
135
  # -------------------------------
136
  # Set Up Retrievers
 
31
  # Utility Functions
32
  # -------------------------------
33
 
34
+ def metadata_func(record,additional_fields=None):
35
+ return {
36
+ "title": record.get("Title", ""),
37
+ "organization": record.get("Organization", ""),
38
+ "LA 2050 Grant Status": record.get("Ranking", ""),
39
+ "impact": record.get("Impact Metrics", ""),
40
+ "year": record.get("Year", ""),
41
+ "urls": {
42
+ "website": record.get("Website", ""),
43
+ "twitter": record.get("Twitter", ""),
44
+ "instagram": record.get("Instagram", ""),
45
+ "facebook": record.get("FaceBook", ""),
46
+ "newsletter": record.get("Newsletter", ""),
47
+ "volunteer": record.get("Volunteer", ""),
48
+ "la2050": record.get("LA2050", "")
49
+ },
50
+ "social": {
51
+ "twitter": record.get("Twitter", ""),
52
+ "instagram": record.get("Instagram", ""),
53
+ "facebook": record.get("FaceBook", "")
54
+ },
55
+ "working_area": record.get("Working Areas in LA", ""),
56
+ "zipcode": record.get("Zipcode", "")
57
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
+ # Load the JSON data with custom metadata and content key
60
+ loader = JSONLoader(
61
+ file_path='data.json',
62
+ jq_schema='.[]',
63
+ content_key='Summary',
64
+ metadata_func=metadata_func # Pass the metadata_func function directly here
65
+ )
66
+
67
+ data = loader.load()
68
 
 
 
69
 
70
  # Use a text splitter to create chunks from the documents.
71
  # (If you find that key fields are getting split, consider implementing a custom splitter.)
72
  from langchain_text_splitters import RecursiveCharacterTextSplitter
73
  text_splitter = RecursiveCharacterTextSplitter(
74
+ chunk_size=760,
75
  chunk_overlap=100,
76
  add_start_index=True
77
  )
78
+ all_splits = text_splitter.split_documents(data)
79
 
80
  # -------------------------------
81
  # Set Up Retrievers