Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -8,6 +8,7 @@ import asyncio
|
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
| 10 |
import os
|
|
|
|
| 11 |
|
| 12 |
# Initialize logging
|
| 13 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -98,7 +99,29 @@ def receipt_radar_prompt(raw_text:str)->str:
|
|
| 98 |
"""
|
| 99 |
return system_prompt
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
|
| 104 |
async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
@@ -108,19 +131,6 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
| 108 |
try:
|
| 109 |
logger.info(f"Starting batch processing for job {batch_job_id}")
|
| 110 |
|
| 111 |
-
system_prompt = '''
|
| 112 |
-
Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
|
| 113 |
-
You will be provided with a movie description, and you will output a json object containing the following information:
|
| 114 |
-
|
| 115 |
-
{
|
| 116 |
-
categories: string[] // Array of categories based on the movie description,
|
| 117 |
-
summary: string // 1-sentence summary of the movie based on the movie description
|
| 118 |
-
}
|
| 119 |
-
|
| 120 |
-
Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
|
| 121 |
-
Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
|
| 122 |
-
'''
|
| 123 |
-
|
| 124 |
openai_tasks = []
|
| 125 |
for ds in dataset.get('data'):
|
| 126 |
message_id = ds.get('message_id')
|
|
@@ -128,7 +138,8 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
| 128 |
receipt_text = ds.get('receipt_text')
|
| 129 |
email = ds.get('email')
|
| 130 |
|
| 131 |
-
|
|
|
|
| 132 |
task = {
|
| 133 |
"custom_id": f"{message_id}-{user_id}-{email}",
|
| 134 |
"method": "POST",
|
|
@@ -142,7 +153,7 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
| 142 |
"messages": [
|
| 143 |
{
|
| 144 |
"role": "user",
|
| 145 |
-
"content":
|
| 146 |
}
|
| 147 |
]
|
| 148 |
}
|
|
@@ -167,18 +178,18 @@ async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
| 167 |
)
|
| 168 |
|
| 169 |
# Update status in Supabase
|
| 170 |
-
supabase.table("batch_processing_details").
|
| 171 |
"batch_job_status": True,
|
| 172 |
"completed_at": datetime.utcnow().isoformat()
|
| 173 |
-
}).
|
| 174 |
|
| 175 |
logger.info(f"Batch job {batch_job_id} processed successfully")
|
| 176 |
|
| 177 |
except Exception as e:
|
| 178 |
logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
|
| 179 |
# Update status with error
|
| 180 |
-
supabase.table("batch_processing_details").
|
| 181 |
"batch_job_status": False,
|
| 182 |
"error": str(e),
|
| 183 |
"completed_at": datetime.utcnow().isoformat()
|
| 184 |
-
}).
|
|
|
|
| 8 |
import logging
|
| 9 |
from datetime import datetime
|
| 10 |
import os
|
| 11 |
+
import tiktoken
|
| 12 |
|
| 13 |
# Initialize logging
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 99 |
"""
|
| 100 |
return system_prompt
|
| 101 |
|
| 102 |
+
def adjust_prompt_tokens_v1(prompt: str, max_tokens: int = 127500) -> str:
    """Trim *prompt* so its token count stays within ``max_tokens``.

    The prompt is tokenized with the tiktoken encoding for ``LLM_MODEL``.
    If it exceeds the budget, tokens are cut from the end and the decoded
    text is trimmed back to the last complete word so the result never
    ends in a partial word.

    Args:
        prompt: The full prompt text to (possibly) shorten.
        max_tokens: Token budget. Defaults to 127500 — presumably chosen
            to sit just under a 128k model context window; confirm against
            the model configured in ``LLM_MODEL``.

    Returns:
        ``prompt`` unchanged if it fits within the budget, otherwise a
        trimmed copy.
    """
    encoding = tiktoken.encoding_for_model(LLM_MODEL)
    tokens = encoding.encode(prompt)

    # Guard clause: nothing to do when the prompt already fits.
    if len(tokens) <= max_tokens:
        return prompt

    # Keep the first max_tokens tokens and decode back to text.
    trimmed_text = encoding.decode(tokens[:max_tokens])

    # Avoid ending on a partial word: cut back to the last space.
    # If the text contains no space at all, keep the token-trimmed
    # text as-is (matches the original behavior).
    last_space = trimmed_text.rfind(' ')
    if last_space != -1:
        trimmed_text = trimmed_text[:last_space]

    return trimmed_text
|
| 125 |
|
| 126 |
|
| 127 |
async def process_batch_job(dataset: Dict[str, Any], batch_job_id: str):
|
|
|
|
| 131 |
try:
|
| 132 |
logger.info(f"Starting batch processing for job {batch_job_id}")
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
openai_tasks = []
|
| 135 |
for ds in dataset.get('data'):
|
| 136 |
message_id = ds.get('message_id')
|
|
|
|
| 138 |
receipt_text = ds.get('receipt_text')
|
| 139 |
email = ds.get('email')
|
| 140 |
|
| 141 |
+
text = adjust_prompt_tokens_v1(receipt_radar_prompt(receipt_text))
|
| 142 |
+
|
| 143 |
task = {
|
| 144 |
"custom_id": f"{message_id}-{user_id}-{email}",
|
| 145 |
"method": "POST",
|
|
|
|
| 153 |
"messages": [
|
| 154 |
{
|
| 155 |
"role": "user",
|
| 156 |
+
"content": text
|
| 157 |
}
|
| 158 |
]
|
| 159 |
}
|
|
|
|
| 178 |
)
|
| 179 |
|
| 180 |
# Update status in Supabase
|
| 181 |
+
supabase.table("batch_processing_details").insert({
|
| 182 |
"batch_job_status": True,
|
| 183 |
"completed_at": datetime.utcnow().isoformat()
|
| 184 |
+
}).execute()
|
| 185 |
|
| 186 |
logger.info(f"Batch job {batch_job_id} processed successfully")
|
| 187 |
|
| 188 |
except Exception as e:
|
| 189 |
logger.error(f"Error processing batch job {batch_job_id}: {str(e)}")
|
| 190 |
# Update status with error
|
| 191 |
+
supabase.table("batch_processing_details").insert({
|
| 192 |
"batch_job_status": False,
|
| 193 |
"error": str(e),
|
| 194 |
"completed_at": datetime.utcnow().isoformat()
|
| 195 |
+
}).execute()
|