Spaces:
Sleeping
Sleeping
Update get_gmail_data.py
Browse files- get_gmail_data.py +25 -5
get_gmail_data.py
CHANGED
|
@@ -148,11 +148,22 @@ class GmailDataExtractor:
|
|
| 148 |
|
| 149 |
if 'payload' in message_data and 'parts' in message_data['payload']:
|
| 150 |
parts = message_data['payload']['parts']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
for part in parts:
|
| 152 |
if 'mimeType' not in part:
|
| 153 |
continue
|
| 154 |
|
| 155 |
mime_type = part['mimeType']
|
|
|
|
| 156 |
if mime_type == 'text/plain' or mime_type == 'text/html':
|
| 157 |
body_data = part['body'].get('data', '')
|
| 158 |
body = base64.urlsafe_b64decode(body_data).decode('utf-8')
|
|
@@ -166,9 +177,18 @@ class GmailDataExtractor:
|
|
| 166 |
|
| 167 |
if data:
|
| 168 |
# Save only the first 10 characters of the attachment data
|
| 169 |
-
return subject,body ,{"filename":filename , "data":data}
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
def extract_text_and_links(html_content: str) -> tuple:
|
| 174 |
"""
|
|
@@ -214,14 +234,14 @@ class GmailDataExtractor:
|
|
| 214 |
messages = self.__fetch_messages()
|
| 215 |
results = []
|
| 216 |
for message in messages:
|
| 217 |
-
subject, body, attachment_data = self.__process_message(message)
|
| 218 |
|
| 219 |
""" Handling None values """
|
| 220 |
-
subject = subject if subject is not None else ""
|
| 221 |
body = body if body is not None else None
|
| 222 |
attachment_data = attachment_data if attachment_data is not None else {}
|
|
|
|
| 223 |
|
| 224 |
-
results.append({"body": body, "attachment_data": [attachment_data]})
|
| 225 |
|
| 226 |
return {"results": results}
|
| 227 |
|
|
|
|
| 148 |
|
| 149 |
if 'payload' in message_data and 'parts' in message_data['payload']:
|
| 150 |
parts = message_data['payload']['parts']
|
| 151 |
+
payload = message_data['payload']
|
| 152 |
+
brand_from_gmail = ''
|
| 153 |
+
company_from_gmail = ''
|
| 154 |
+
if payload['headers']['name'] == 'from':
|
| 155 |
+
brand_from_gmail = payload['headers']['value']
|
| 156 |
+
company_from_gmail = extract_domain_from_email(brand_from_gmail)
|
| 157 |
+
else:
|
| 158 |
+
company_from_gmail = None
|
| 159 |
+
|
| 160 |
+
|
| 161 |
for part in parts:
|
| 162 |
if 'mimeType' not in part:
|
| 163 |
continue
|
| 164 |
|
| 165 |
mime_type = part['mimeType']
|
| 166 |
+
|
| 167 |
if mime_type == 'text/plain' or mime_type == 'text/html':
|
| 168 |
body_data = part['body'].get('data', '')
|
| 169 |
body = base64.urlsafe_b64decode(body_data).decode('utf-8')
|
|
|
|
| 177 |
|
| 178 |
if data:
|
| 179 |
# Save only the first 10 characters of the attachment data
|
| 180 |
+
return subject,body ,{"filename":filename , "data":data} , company_from_gmail
|
| 181 |
+
|
| 182 |
+
return subject, body,None , company_from_gmail
|
| 183 |
|
| 184 |
+
def extract_domain_from_email(email):
    """Return the domain part of an e-mail address, or None if there is none.

    Accepts either a bare address ("user@example.com") or a full From-header
    value with a display name ("Brand <user@example.com>"). The text after
    the last "@" of the parsed address is returned.

    Args:
        email: Address or From-header string. None and "" are tolerated.

    Returns:
        The domain as a string (e.g. "example.com"), or None when the input
        is empty or contains no "@" followed by at least one character.
    """
    # Local import: the parameter is named ``email``, but ``from ... import``
    # binds only ``parseaddr``, so the module and the argument do not clash.
    from email.utils import parseaddr

    if not email:
        return None

    # parseaddr strips the display name and the surrounding angle brackets,
    # so "Brand <user@example.com>" no longer yields "example.com>".
    _display_name, address = parseaddr(email)

    # Split on the last "@" so a quoted local part containing "@" cannot
    # leak into the returned domain.
    _local, separator, domain = address.rpartition("@")
    if not separator or not domain:
        return None
    return domain
|
| 191 |
+
|
| 192 |
|
| 193 |
def extract_text_and_links(html_content: str) -> tuple:
|
| 194 |
"""
|
|
|
|
| 234 |
messages = self.__fetch_messages()
|
| 235 |
results = []
|
| 236 |
for message in messages:
|
| 237 |
+
subject, body, attachment_data , company_name = self.__process_message(message)
|
| 238 |
|
| 239 |
""" Handling None values """
|
|
|
|
| 240 |
body = body if body is not None else None
|
| 241 |
attachment_data = attachment_data if attachment_data is not None else {}
|
| 242 |
+
company_associated = company_name if company_name is not None else None
|
| 243 |
|
| 244 |
+
results.append({"body": body, "attachment_data": [attachment_data] ,'company_associated':company_associated})
|
| 245 |
|
| 246 |
return {"results": results}
|
| 247 |
|