Spaces:
Sleeping
Sleeping
content_generator v1.0
Browse files- src/image_generation_functions.py +453 -0
- src/pipelines_functions.py +482 -0
- src/streamlit_app.py +723 -38
- src/utils_functions.py +346 -0
src/image_generation_functions.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import time
|
| 4 |
+
import re
|
| 5 |
+
import mimetypes
|
| 6 |
+
from io import BytesIO
|
| 7 |
+
from PIL import Image as PILImage
|
| 8 |
+
import google.generativeai as genai
|
| 9 |
+
from google.cloud import storage
|
| 10 |
+
from google import genai as google_genai
|
| 11 |
+
from google.genai import types
|
| 12 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
|
| 15 |
+
load_dotenv()

# ============================================================
# IMAGE GENERATION CONFIGURATION (two separate API keys)
# ============================================================

# Key used for text correction (Gemini 2.5 Flash).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Key used for image generation (Gemini 2.5 Flash Image, google.genai client).
IMAGE_API_KEY = os.getenv("IMAGE_API_KEY")

GCP_CREDENTIALS_JSON = os.getenv("GCP_CREDENTIALS_JSON")
GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
GCP_BUCKET_NAME = os.getenv("GCP_BUCKET_NAME")

# Configure the google.generativeai client used for text correction.
# A missing key is only warned about here; this module still imports.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    print("⚠️ GEMINI_API_KEY not set - text correction will fail")

# Storage handles are bound up front so the names always exist, even when
# GCP is not configured or initialization fails; check GCP_AVAILABLE before
# using them.
gcp_client = None
gcp_bucket = None
GCP_AVAILABLE = False

try:
    if GCP_CREDENTIALS_JSON and GCP_PROJECT_ID and GCP_BUCKET_NAME:
        from google.oauth2 import service_account

        # GCP_CREDENTIALS_JSON holds the service-account key as a JSON string.
        credentials_dict = json.loads(GCP_CREDENTIALS_JSON)
        credentials = service_account.Credentials.from_service_account_info(credentials_dict)
        gcp_client = storage.Client(credentials=credentials, project=GCP_PROJECT_ID)
        gcp_bucket = gcp_client.bucket(GCP_BUCKET_NAME)
        GCP_AVAILABLE = True
        print("✓ GCP Storage configured for image uploads")
    else:
        print("⚠️ GCP credentials not fully configured - image upload disabled")
except Exception as e:
    # Best-effort setup: any failure (bad JSON, bad credentials, client
    # error) disables uploads instead of breaking the import.
    gcp_client = None
    gcp_bucket = None
    GCP_AVAILABLE = False
    print(f"⚠️ GCP configuration error: {e}")
|
| 55 |
+
|
| 56 |
+
# ============================================================
|
| 57 |
+
# AUTOCROP FUNCTION (Proper implementation)
|
| 58 |
+
# ============================================================
|
| 59 |
+
|
| 60 |
+
def autocrop_tight_vertical(image_path, output_path=None):
    """
    Trim near-white rows from the top and bottom of an image.

    Only the vertical extent changes; the full width is always kept. A row
    counts as content when the mean of its R, G and B values is below the
    white threshold (250). A 10-pixel margin is kept above and below the
    detected content.

    Args:
        image_path: Path of the image to read.
        output_path: Optional path; when given, the cropped image is saved
            there. It may be the same as image_path.

    Returns:
        The cropped RGB image, the uncropped RGB image when the crop window
        is empty, or None when anything fails (the error is printed).
    """
    try:
        # Close the source file as soon as the pixels are copied, so that
        # saving to output_path works even when it equals image_path.
        with PILImage.open(image_path) as img:
            rgb_img = img.convert('RGB')

        width, height = rgb_img.size

        # Pure white or very close to it.
        white_threshold = 250
        margin = 10

        # Packed RGB bytes, three per pixel, row after row. Summing a row
        # slice runs in C instead of one getpixel() call per pixel.
        raw = rgb_img.tobytes()
        row_stride = width * 3
        # mean(row) < threshold  <=>  sum(row bytes) < threshold * 3 * width
        dark_limit = white_threshold * row_stride

        def row_has_content(y):
            start = y * row_stride
            return sum(raw[start:start + row_stride]) < dark_limit

        # First content row from the top (0 when every row is white).
        top_crop = next((y for y in range(height) if row_has_content(y)), 0)

        # One past the last content row (height when every row is white).
        bottom_crop = next(
            (y + 1 for y in range(height - 1, -1, -1) if row_has_content(y)),
            height,
        )

        # Keep a small margin, clamped to the image bounds.
        top_crop = max(0, top_crop - margin)
        bottom_crop = min(height, bottom_crop + margin)

        # Defensive guard: never produce an empty crop window.
        if bottom_crop <= top_crop:
            print(" ⚠️ Autocrop: No content found, returning original")
            return rgb_img

        cropped_img = rgb_img.crop((0, top_crop, width, bottom_crop))

        if output_path:
            cropped_img.save(output_path)

        print(f" ✓ Autocropped from {height}px to {cropped_img.size[1]}px")
        return cropped_img

    except Exception as e:
        # Cropping is optional for callers: report and signal failure with None.
        print(f"⚠️ Autocrop failed: {e}")
        return None
|
| 122 |
+
|
| 123 |
+
# ============================================================
|
| 124 |
+
# TECHNICAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
|
| 125 |
+
# ============================================================
|
| 126 |
+
|
| 127 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def generate_technical_image(slide_title, slide_content, image_description):
    """
    Generate a technical diagram with the Gemini 2.5 Flash Image model.

    Streams the response from the google.genai client and returns the first
    inline image payload found in any part of any chunk.

    Args:
        slide_title: Title of the slide the image illustrates.
        slide_content: Body text of the slide, quoted in the prompt.
        image_description: Description of the diagram to draw.

    Returns:
        (True, image_bytes) on success, or (False, error_message) when the
        key is missing, no image came back, or the call raised.

    NOTE(review): every exception is caught below and turned into a
    (False, message) result, so the @retry decorator (which by default only
    retries when the call raises) never actually retries.
    """
    try:
        if not IMAGE_API_KEY:
            return False, "IMAGE_API_KEY not configured"

        # Client bound to the dedicated image-generation key.
        client = google_genai.Client(api_key=IMAGE_API_KEY)

        # Professional technical prompt
        prompt_text = f"""
Generate a professional, clean, and visually compelling image for a technical presentation.

**Context:**
This image will be used for a slide titled "{slide_title}" with the following content:
"{slide_content}"

The image should visually represent the concept described below to enhance understanding:
{image_description}

**Critical Requirements:**
- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
- Component labels ARE allowed where necessary for clarity (e.g., "API Server", "Worker Node", "Control Plane").
- Include a brief, centered caption below the image (max 5-7 words, research paper style) summarizing the visual concept.
- Use full canvas space efficiently — minimize blank margins, maximize information density.
- Clean, professional, modern aesthetic.
- Use color strategically to convey meaning and hierarchy.
- Suitable for a formal technical presentation slide.
- Prefer abstract/conceptual visualizations over literal images.
- Ensure all text in the diagram is spell-checked and professionally styled.

**Style Guidelines:**
- Pure white background (#FFFFFF) for professional appearance.
- Professional color palette optimized for white backgrounds:
* Primary: Deep navy blue (#1a365d), slate gray (#475569)
* Accent: Teal (#0d9488), ocean blue (#0284c7)
- Minimalist and elegant design with balanced spacing.
- 4:3 aspect ratio (landscape orientation).
"""

        print(f" 🎨 Generating technical image for: {slide_title}...")

        contents = [types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)]
        )]

        # Ask for image output (text may accompany it) at 4:3.
        generate_content_config = types.GenerateContentConfig(
            response_modalities=["IMAGE", "TEXT"],
            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
        )

        for chunk in client.models.generate_content_stream(
            model="gemini-2.5-flash-image",
            contents=contents,
            config=generate_content_config
        ):
            if not chunk.candidates:
                continue

            content = getattr(chunk.candidates[0], 'content', None)
            if content is None:
                continue

            # Text and image parts can share a chunk, so look at every part
            # rather than only the first one.
            for part in getattr(content, 'parts', None) or []:
                inline_data = getattr(part, 'inline_data', None)
                if inline_data is not None and inline_data.data:
                    print(f" ✅ Image generated successfully")
                    return True, inline_data.data

        return False, "No image generated from API"

    except Exception as e:
        print(f" ❌ Image generation error: {str(e)}")
        return False, f"Error: {str(e)}"
|
| 224 |
+
|
| 225 |
+
# ============================================================
|
| 226 |
+
# OPERATIONAL IMAGE GENERATION (FIXED - NEW API with proper error checking)
|
| 227 |
+
# ============================================================
|
| 228 |
+
|
| 229 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def generate_operational_image(slide_title, slide_content, image_description):
    """
    Generate a business/operational diagram with the Gemini 2.5 Flash Image model.

    Streams the response from the google.genai client and returns the first
    inline image payload found in any part of any chunk.

    Args:
        slide_title: Title of the slide the image illustrates.
        slide_content: Body text of the slide, quoted in the prompt.
        image_description: Description of the diagram to draw.

    Returns:
        (True, image_bytes) on success, or (False, error_message) when the
        key is missing, no image came back, or the call raised.

    NOTE(review): every exception is caught below and turned into a
    (False, message) result, so the @retry decorator (which by default only
    retries when the call raises) never actually retries.
    """
    try:
        if not IMAGE_API_KEY:
            return False, "IMAGE_API_KEY not configured"

        # Client bound to the dedicated image-generation key.
        client = google_genai.Client(api_key=IMAGE_API_KEY)

        # Business-focused prompt
        prompt_text = f"""
Generate a professional, clean business/operational diagram for a compliance or regulatory presentation.

**Context:**
This image will be used for a slide titled "{slide_title}" with the following business content:
"{slide_content}"

The image should visually represent the operational/business/compliance concept described below:
{image_description}

**Critical Requirements:**
- NO explanatory text, paragraphs, or detailed written descriptions overlaid on the image.
- Component labels and process flow indicators ARE allowed (e.g., "Compliance Check", "Approval", "Risk Mitigation").
- Include a brief, centered caption below the image (max 5-7 words, business report style).
- Use full canvas space efficiently — minimize blank margins.
- Clean, professional, corporate aesthetic.
- Use color strategically: consider business standard colors (blue for trust, green for process).
- Suitable for a formal business presentation or compliance report.
- Prefer process flows, matrices, or business diagrams.

**Style Guidelines:**
- Pure white background (#FFFFFF).
- Professional business color palette:
* Primary: Corporate blue (#003366), professional gray (#4a5568)
* Accent: Business green (#2d5016), alert red (#c53030)
- Clean, minimal design with professional spacing.
- 4:3 aspect ratio (landscape for business presentations).
"""

        print(f" 📊 Generating operational image for: {slide_title}...")

        contents = [types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)]
        )]

        # Ask for image output (text may accompany it) at 4:3.
        generate_content_config = types.GenerateContentConfig(
            response_modalities=["IMAGE", "TEXT"],
            image_config=types.ImageConfig(aspect_ratio="4:3", image_size="1K"),
        )

        for chunk in client.models.generate_content_stream(
            model="gemini-2.5-flash-image",
            contents=contents,
            config=generate_content_config
        ):
            if not chunk.candidates:
                continue

            content = getattr(chunk.candidates[0], 'content', None)
            if content is None:
                continue

            # Text and image parts can share a chunk, so look at every part
            # rather than only the first one.
            for part in getattr(content, 'parts', None) or []:
                inline_data = getattr(part, 'inline_data', None)
                if inline_data is not None and inline_data.data:
                    print(f" ✅ Image generated successfully")
                    return True, inline_data.data

        return False, "No image generated from API"

    except Exception as e:
        print(f" ❌ Image generation error: {str(e)}")
        return False, f"Error: {str(e)}"
|
| 325 |
+
|
| 326 |
+
# ============================================================
|
| 327 |
+
# PIPELINE IMAGE REPLACEMENT (FIXED - Complete integration)
|
| 328 |
+
# ============================================================
|
| 329 |
+
|
| 330 |
+
def process_images_for_pipeline(slide_json, mode="technical"):
    """
    Generate, crop, upload and link an image for every slide that asks for one.

    For each entry of ``slide_json['content']`` whose ``image_description``
    is set (and is not the string "null"):

    1. Generate the image with the generator selected by ``mode``.
    2. Write it to a temporary PNG under /tmp/gen_images.
    3. Autocrop vertical white space in place.
    4. Upload the file to the GCP bucket when GCP_AVAILABLE is true.
    5. Replace ``image_description`` with the public URL, or with a short
       failure message when no URL was produced.

    The temporary file is removed whether or not the slide succeeded.

    Args:
        slide_json: Dict with a 'content' list of slide dicts and an optional
            'topic'. The slide dicts are updated in place.
        mode: "technical" selects generate_technical_image; any other value
            selects generate_operational_image.

    Returns:
        The same slide_json object after the in-place updates.
    """
    print(f"\n{'='*70}")
    print(f"🎨 STAGE 4: Processing Images ({mode.upper()} Mode)")
    print('='*70)

    # Folder for intermediate images
    temp_folder = "/tmp/gen_images"
    os.makedirs(temp_folder, exist_ok=True)

    image_generator = generate_technical_image if mode == "technical" else generate_operational_image

    # The topic slug is the same for every slide, so build it once. str()
    # keeps a non-string topic from failing every slide.
    raw_topic = slide_json.get('topic', 'topic')
    if raw_topic is None:
        raw_topic = 'topic'
    topic_slug = re.sub(r'[^a-zA-Z0-9_-]+', '_', str(raw_topic).strip().lower()).strip('_')
    topic_slug = topic_slug[:15]

    for idx, slide in enumerate(slide_json.get('content', []), 1):
        # Skip slides without an image description (missing, empty, or "null").
        if not slide.get('image_description') or slide['image_description'] == "null":
            print(f" ⊘ Slide {idx}: No image description")
            continue

        temp_file_path = None
        try:
            slide_title = slide.get('slide_title', 'Slide')
            slide_content = slide.get('slide_content', '')
            image_desc = slide.get('image_description', '')

            print(f"\n 📌 Processing Slide {idx}: {slide_title}")

            # STEP 1: Generate image
            print(f" 1️⃣ Generating image...")
            success, result = image_generator(slide_title, slide_content, image_desc)

            if not success:
                print(f" ❌ Generation failed: {result}")
                slide['image_description'] = f"Failed: {result}"
                continue

            image_data = result

            # STEP 2: Save image temporarily
            print(f" 2️⃣ Saving to temporary file...")
            ts = int(time.time())
            temp_file_name = f"slide_{idx}_{topic_slug}_{mode}_{ts}.png"
            temp_file_path = os.path.join(temp_folder, temp_file_name)

            with open(temp_file_path, 'wb') as f:
                f.write(image_data)

            print(f" ✓ Saved: {temp_file_name}")

            # STEP 3: Autocrop white space (best effort; the uncropped file
            # is uploaded when cropping fails).
            print(f" 3️⃣ Autocropping white space...")
            try:
                cropped = autocrop_tight_vertical(temp_file_path, temp_file_path)
                # autocrop reports its own failures by returning None.
                if cropped is None:
                    print(f" ⚠️ Autocrop skipped: autocrop returned no image")
                else:
                    print(f" ✓ Autocrop successful")
            except Exception as e:
                print(f" ⚠️ Autocrop skipped: {e}")

            # STEP 4: Upload to GCP
            print(f" 4️⃣ Uploading to GCP Storage...")
            image_url = None

            if GCP_AVAILABLE:
                try:
                    with open(temp_file_path, 'rb') as f:
                        image_bytes = f.read()

                    gcp_blob_path = f"images/{mode}/{temp_file_name}"
                    blob = gcp_bucket.blob(gcp_blob_path)
                    blob.upload_from_string(image_bytes, content_type="image/png")

                    image_url = blob.public_url
                    print(f" ✅ Uploaded to GCP: {image_url}")

                except Exception as e:
                    # Upload failure is not fatal; only the message differs
                    # between billing problems and everything else.
                    error_str = str(e).lower()
                    if 'billing' in error_str or 'project_invalid' in error_str:
                        print(f" ⚠️ GCP billing not enabled")
                    else:
                        print(f" ❌ GCP upload error: {str(e)}")
                    image_url = None
            else:
                print(f" ⚠️ GCP not configured - cannot upload")

            # STEP 5: Update slide with URL or error message
            if image_url:
                slide['image_description'] = image_url
                print(f" ✅ Slide {idx} complete: Image available at GCP URL")
            else:
                slide['image_description'] = "Image generation succeeded but upload unavailable"
                print(f" ⚠️ Slide {idx}: Image not uploaded to GCP")

        except Exception as e:
            # One bad slide must not stop the remaining slides.
            print(f" ❌ Error processing slide {idx}: {str(e)}")
            slide['image_description'] = f"Error: {str(e)}"

        finally:
            # Remove the temp file on every path; a missing file is fine.
            if temp_file_path is not None:
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass

    print(f"\n✅ Image processing complete")
    return slide_json
|
| 452 |
+
|
| 453 |
+
# Readiness banner; appears to be a module-level statement, so it would print when this module is loaded.
print("β Image generation functions ready (NEW Gemini 2.5 Flash Image API + proper error checking)")
|
src/pipelines_functions.py
ADDED
|
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from openai import OpenAI
|
| 5 |
+
import google.generativeai as genai
|
| 6 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
load_dotenv()

# ============================================================
# API INITIALIZATION
# ============================================================

PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

# Fail fast: both keys are required, so a missing key aborts the import of
# this module with a ValueError.
if not PERPLEXITY_API_KEY:
    raise ValueError("❌ PERPLEXITY_API_KEY not set in .env")
if not GEMINI_API_KEY:
    raise ValueError("❌ GEMINI_API_KEY not set in .env")

# The OpenAI client is pointed at the Perplexity endpoint through base_url.
perplexity_client = OpenAI(
    api_key=PERPLEXITY_API_KEY,
    base_url="https://api.perplexity.ai",
)

# Configure the google.generativeai client with the Gemini key.
genai.configure(api_key=GEMINI_API_KEY)
|
| 29 |
+
|
| 30 |
+
# ============================================================
|
| 31 |
+
# TECHNICAL PIPELINE
|
| 32 |
+
# ============================================================
|
| 33 |
+
|
| 34 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def generate_technical_content(topic):
    """
    Stage 1: generate technical slides for a topic using Perplexity.

    Sends the slide-generation prompt to the "sonar-pro" model and parses
    the reply as JSON. When the reply is not pure JSON, the outermost
    ``{...}`` span is extracted and parsed instead. A missing "aliases"
    field is filled with the lowercased, stripped topic.

    Args:
        topic: Topic string to build the presentation around.

    Returns:
        The parsed presentation as a dict (topic, aliases, content, urls).

    Raises:
        ValueError: When the reply is empty, has no JSON object, or the JSON
            is not an object.
        Exception: Any client or parsing error is printed and re-raised, so
            the retry decorator can try again.
    """
    print(f"\n📝 Generating technical content for: {topic}")

    try:
        system_prompt = f"""You are a domain expert in technology and IT infrastructure with deep knowledge across all technology domains.

Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.

Instructions:
- Write universally applicable content that any technology professional can understand and use.
- Each slide should have an engaging and concise "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words) with technical depth and practical relevance.
- For the 3 most critical slides ONLY, add "image_description" (strictly 30-40 words) describing specific technical diagrams.
- First slide: Overview explaining why this technology matters universally.
- Last slide: "Further Learning & Documentation" with placeholder for 5 curated URLs.
- Use clear, accessible language. Avoid industry-specific jargon.
- For all other slides, set image_description to null.

Additional Requirement — ALIASES FIELD:
- Generate 6-7 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and common variations.

Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", "alias3", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Generate a universally applicable technical presentation on {topic}"}
            ],
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )

        content = response.choices[0].message.content
        if not content:
            raise ValueError("Could not parse JSON from response")

        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            # The model sometimes wraps the JSON in prose or code fences;
            # fall back to the outermost {...} span.
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if not json_match:
                raise ValueError("Could not parse JSON from response")
            result = json.loads(json_match.group())

        # Anything but a JSON object cannot carry the slide schema.
        if not isinstance(result, dict):
            raise ValueError("Could not parse JSON from response")

        # Both parse paths share the alias default and the success log.
        if 'aliases' not in result:
            result['aliases'] = [topic.lower().strip()]
        print(f"✅ Generation successful - {len(result.get('content', []))} slides")
        return result

    except Exception as e:
        print(f"❌ Generation failed: {type(e).__name__}: {str(e)}")
        raise
|
| 115 |
+
|
| 116 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def correct_technical_content(generated_json):
    """
    Stage 2: Correct with Gemini 2.5 Flash (TEXT ONLY).
    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb

    Serialises ``generated_json`` into an editing prompt, sends it to the
    ``gemini-2.5-flash`` model and parses the reply as JSON.

    Args:
        generated_json: Presentation dict from the generation stage. It is
            embedded in the prompt via ``json.dumps`` and its ``aliases``
            entry is used to back-fill the reply when the model drops it.

    Returns:
        The corrected presentation dict. When the reply cannot be parsed
        into a JSON object (neither directly nor via the outermost
        ``{...}`` span), ``generated_json`` is returned unchanged.

    Raises:
        Exception: Any failure of the model call itself is printed and
            re-raised; the enclosing ``@retry`` is configured for 2 attempts.
    """
    # NOTE(review): the leading glyph in this message looks mis-encoded in this
    # copy of the file — confirm the intended emoji.
    print("\nπ Correcting technical content with Gemini 2.5 Flash")

    try:
        gemini_model = genai.GenerativeModel("gemini-2.5-flash")

        correction_prompt = f"""You are an expert technical editor for universal technology training materials.

Review the following slide presentation and improve it:

{json.dumps(generated_json, indent=2)}

Your tasks:
1. Ensure slide titles are clear, concise (max 6 words) and engaging.
2. Verify that slide_content is universally applicable.
3. Check that content flows logically, is technically accurate.
4. For image_descriptions: Make them specific, actionable, and suitable for technical diagram generation.
5. Review and enhance URLs - add 2-3 additional high-quality URLs if missing.
6. Keep all word counts natural and readable.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided (do not change it).
- Keep "image_description" fields exactly as they are.
- For slides without image_description, set to null.
- Retain the most educationally valuable 3 slides for images — set the rest to null.

OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
Do not include code fences, markdown, or extra commentary.
"""

        response = gemini_model.generate_content(correction_prompt)
        corrected_text = response.text.strip()

        # Drop a Markdown code fence around the payload if the model added one.
        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)

        result = None
        try:
            result = json.loads(corrected_text)
        except json.JSONDecodeError:
            # Second chance: parse the outermost {...} span of the reply.
            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    # Still malformed: fall through to the "return original"
                    # path instead of escaping from inside the handler.
                    result = None

        # Only a JSON object matches the presentation schema; lists, strings
        # or numbers would crash on the item assignment below.
        if not isinstance(result, dict):
            print("⚠️ Correction parsing failed - returning original")
            return generated_json

        if 'aliases' not in result:
            result['aliases'] = generated_json.get('aliases', [])
        print("✅ Correction successful")
        return result

    except Exception as e:
        print(f"❌ Correction failed: {type(e).__name__}: {e}")
        raise
|
| 182 |
+
|
| 183 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def refine_technical_content(validated_json):
    """
    Stage 3: Final refinement with Perplexity.
    EXACT PROMPT from technical_gcp_image_pipeline-1.ipynb

    Serialises ``validated_json`` into a refinement prompt, sends it to the
    ``sonar-pro`` model through ``perplexity_client`` and parses the reply
    as JSON.

    Args:
        validated_json: Presentation dict from the correction stage. It is
            embedded in the prompt via ``json.dumps`` and its ``aliases``
            entry is used to back-fill the reply when the model drops it.

    Returns:
        The refined presentation dict. When the reply cannot be parsed into
        a JSON object (neither directly nor via the outermost ``{...}``
        span), ``validated_json`` is returned unchanged.

    Raises:
        Exception: Any failure of the API call itself is printed and
            re-raised; the enclosing ``@retry`` is configured for 2 attempts.
    """
    # NOTE(review): the leading glyph in this message looks mis-encoded in this
    # copy of the file — confirm the intended emoji.
    print("\nπ Refining technical content")

    try:
        refine_prompt = f"""You are a senior technical content specialist for universal technology training.

This slide presentation has been validated. Perform the final refinement:

{json.dumps(validated_json, indent=2)}

Your tasks:
1. Ensure image_descriptions are detailed, specific, and suitable for technical diagram generation.
2. Verify that slide content is universally applicable and consistent.
3. Confirm that all technical terms are accurate.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order them by: Authority, Relevance, Learning value, Diversity.
- Ensure all chosen URLs are authoritative and current.
5. Keep all slide content exactly the same length/style.
6. Maintain perfect JSON structure.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.

OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )

        refined_text = response.choices[0].message.content.strip()

        # Drop a Markdown code fence around the payload if the model added one.
        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)

        result = None
        try:
            result = json.loads(refined_text)
        except json.JSONDecodeError:
            # Second chance: parse the outermost {...} span of the reply.
            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    # Still malformed: fall through to the "return validated
                    # content" path instead of escaping from the handler.
                    result = None

        # Only a JSON object matches the presentation schema; lists, strings
        # or numbers would crash on the item assignment below.
        if not isinstance(result, dict):
            print("⚠️ Refinement failed - returning validated content")
            return validated_json

        if 'aliases' not in result:
            result['aliases'] = validated_json.get('aliases', [])
        print("✅ Refinement successful")
        return result

    except Exception as e:
        print(f"❌ Refinement failed: {type(e).__name__}: {e}")
        raise
|
| 254 |
+
|
| 255 |
+
# ============================================================
|
| 256 |
+
# OPERATIONAL PIPELINE
|
| 257 |
+
# ============================================================
|
| 258 |
+
|
| 259 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10)
)
def generate_operational_content(topic):
    """
    Stage 1: Generate operational slides using Perplexity.
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Asks the ``sonar-pro`` model (through ``perplexity_client``) for a slide
    deck on ``topic`` and parses the reply as JSON.

    Args:
        topic: Topic string; interpolated into the prompts and, lower-cased
            and stripped, used as the only alias when the reply has none.

    Returns:
        The generated presentation dict, always carrying an ``aliases`` key.

    Raises:
        ValueError: The reply could not be parsed into a JSON object, neither
            directly nor via its outermost ``{...}`` span.
        Exception: Any other failure (for example from the API call) is
            printed and re-raised; the enclosing ``@retry`` is configured
            for 3 attempts.
    """
    # NOTE(review): the leading glyph in this message looks mis-encoded in this
    # copy of the file — confirm the intended emoji.
    print(f"\nπ Generating operational content for: {topic}")

    try:
        system_prompt = f"""You are a domain expert in business operations, compliance, regulatory frameworks, and enterprise management.

Task:
For the topic "{topic}", generate 9 to 10 slides as JSON.

Instructions:
- Target intermediate professionals (2+ years experience) seeking actionable, scenario-driven insights.
- Each slide should have a unique and engaging "slide_title" (maximum 6 words).
- "slide_content" must be 3-4 sentences (strictly 40-60 words), balancing regulatory requirements with operational business value.
- Emphasize both regulatory drivers AND business impact: compliance obligations, operational efficiency, risk mitigation, and competitive advantage.
- For the 3 most important slides ONLY, add "image_description" (strictly 30-40 words) describing meaningful business/operational diagrams.
- First slide: Overview positioning the topic's regulatory importance and business operational impact.
- Last slide: "Further Learning & Documentation" with specific next learning topics.
- Use clear, accessible language without basic dictionary definitions.
- Focus on practical application, regulatory compliance, and business outcomes.
- For all other slides, set image_description to null.

Additional Requirement — ALIASES FIELD:
- Generate 4-5 lowercase alternative names/synonyms for "{topic}".
- First alias MUST be the normalized lowercase form of the topic.
- Include abbreviations and terms that refer to the same concept.

Output ONLY valid JSON (no code blocks, no markdown):
{{
"topic": "{topic}",
"aliases": ["primary lowercase form", "alias2", ...],
"content": [
{{
"slide_title": "...",
"slide_content": "...",
"image_description": "..." or null
}}
],
"urls": [
{{"title": "...", "url": "https://..."}},
...
]
}}
"""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Generate an intermediate-level, practical business operations presentation on: {topic}"}
            ],
            temperature=0.5,
            max_tokens=4000,
            timeout=60,
        )

        content = response.choices[0].message.content

        try:
            result = json.loads(content)
        except json.JSONDecodeError as direct_error:
            # Second chance: parse the outermost {...} span of the reply.
            json_match = re.search(r'\{.*\}', content, re.DOTALL)
            if not json_match:
                raise ValueError("Could not parse JSON from response") from direct_error
            try:
                result = json.loads(json_match.group())
            except json.JSONDecodeError as fallback_error:
                # Report the documented error rather than leaking the raw
                # decoder exception from inside the handler.
                raise ValueError("Could not parse JSON from response") from fallback_error

        # Only a JSON object matches the presentation schema; lists, strings
        # or numbers would crash on the item assignment below.
        if not isinstance(result, dict):
            raise ValueError("Could not parse JSON from response")

        if 'aliases' not in result:
            result['aliases'] = [topic.lower().strip()]
        print(f"✅ Generation successful - {len(result.get('content', []))} slides")
        return result

    except Exception as e:
        print(f"❌ Generation failed: {type(e).__name__}: {e}")
        raise
|
| 342 |
+
|
| 343 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def correct_operational_content(generated_json):
    """
    Stage 2: Correct with Gemini 2.5 PRO (stronger model for operational).
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Serialises ``generated_json`` into an editing prompt, sends it to the
    ``gemini-2.5-pro`` model and parses the reply as JSON.

    Args:
        generated_json: Presentation dict from the generation stage. It is
            embedded in the prompt via ``json.dumps`` and its ``aliases``
            entry is used to back-fill the reply when the model drops it.

    Returns:
        The corrected presentation dict. When the reply cannot be parsed
        into a JSON object (neither directly nor via the outermost
        ``{...}`` span), ``generated_json`` is returned unchanged.

    Raises:
        Exception: Any failure of the model call itself is printed and
            re-raised; the enclosing ``@retry`` is configured for 2 attempts.
    """
    # NOTE(review): the leading glyph in this message looks mis-encoded in this
    # copy of the file — confirm the intended emoji.
    print("\nπ Correcting operational content with Gemini 2.5 PRO")

    try:
        gemini_model = genai.GenerativeModel("gemini-2.5-pro")  # STRONGER MODEL FOR OPERATIONAL

        correction_prompt = f"""You are an expert business operations and compliance editor.

Review this business operations presentation and improve it:

{json.dumps(generated_json, indent=2)}

Your tasks:
1. Ensure slide titles are clear, concise (max 6 words), and business-focused.
2. Verify slide_content balances regulatory requirements WITH business operational value (40–60 words).
3. Strengthen regulatory references: name specific acts, frameworks, or compliance concepts.
4. For image_descriptions: Make them specific to business processes and regulatory workflows.
5. Review and improve the URLs - add 2-3 additional high-quality official URLs.
6. Maintain the intermediate professional tone.
7. Ensure logical flow: regulatory → operational → actionable insights.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.
- For slides without image_description, set to null.
- Retain the most important 3 slides for images — set the rest to null.

OUTPUT REQUIREMENT:
Return ONLY the corrected JSON in the exact same schema as the input.
"""

        response = gemini_model.generate_content(correction_prompt)
        corrected_text = response.text.strip()

        # Drop a Markdown code fence around the payload if the model added one.
        corrected_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', corrected_text, count=1)
        corrected_text = re.sub(r'\s*```\s*$', '', corrected_text, count=1)

        result = None
        try:
            result = json.loads(corrected_text)
        except json.JSONDecodeError:
            # Second chance: parse the outermost {...} span of the reply.
            json_match = re.search(r'\{.*\}', corrected_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    # Still malformed: fall through to the "return original"
                    # path instead of escaping from inside the handler.
                    result = None

        # Only a JSON object matches the presentation schema; lists, strings
        # or numbers would crash on the item assignment below.
        if not isinstance(result, dict):
            print("⚠️ Correction parsing failed - returning original")
            return generated_json

        if 'aliases' not in result:
            result['aliases'] = generated_json.get('aliases', [])
        print("✅ Correction successful")
        return result

    except Exception as e:
        print(f"❌ Correction failed: {type(e).__name__}: {e}")
        raise
|
| 409 |
+
|
| 410 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=3, max=10)
)
def refine_operational_content(validated_json):
    """
    Stage 3: Final refinement with Perplexity.
    EXACT PROMPT from operational_gcp_image_pipeline-2.ipynb

    Serialises ``validated_json`` into a refinement prompt, sends it to the
    ``sonar-pro`` model through ``perplexity_client`` and parses the reply
    as JSON.

    Args:
        validated_json: Presentation dict from the correction stage. It is
            embedded in the prompt via ``json.dumps`` and its ``aliases``
            entry is used to back-fill the reply when the model drops it.

    Returns:
        The refined presentation dict. When the reply cannot be parsed into
        a JSON object (neither directly nor via the outermost ``{...}``
        span), ``validated_json`` is returned unchanged.

    Raises:
        Exception: Any failure of the API call itself is printed and
            re-raised; the enclosing ``@retry`` is configured for 2 attempts.
    """
    # NOTE(review): the leading glyph in this message looks mis-encoded in this
    # copy of the file — confirm the intended emoji.
    print("\nπ Refining operational content")

    try:
        refine_prompt = f"""You are a senior business operations content specialist.

This business operations presentation has been validated. Perform the final refinement:

{json.dumps(validated_json, indent=2)}

Your tasks:
1. Ensure image descriptions are specific to business workflows, compliance processes, and decision-making.
2. Verify slide content emphasizes actionable business value, regulatory relevance, and measurable outcomes.
3. Confirm terminology is accurate, consistent, and understandable to intermediate business professionals.
4. Review and refine the URLs:
- Select up to 5 of the best URLs only.
- Order by: Authority (regulatory bodies first), Relevance, Learning value, Diversity.
- Ensure all URLs are authoritative, recent, and relevant.
5. Keep all slide content exactly the same.
6. Maintain perfect JSON structure.

CRITICAL INSTRUCTION:
- The field "aliases" must remain EXACTLY as provided.
- Keep "image_description" fields for image generation.

OUTPUT REQUIREMENT:
Return ONLY the refined JSON in the exact same schema as the input.
"""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.3,
            max_tokens=4000,
            timeout=60,
        )

        refined_text = response.choices[0].message.content.strip()

        # Drop a Markdown code fence around the payload if the model added one.
        refined_text = re.sub(r'^\s*```(?:json)?\s*\n?', '', refined_text, count=1)
        refined_text = re.sub(r'\s*```\s*$', '', refined_text, count=1)

        result = None
        try:
            result = json.loads(refined_text)
        except json.JSONDecodeError:
            # Second chance: parse the outermost {...} span of the reply.
            json_match = re.search(r'\{.*\}', refined_text, re.DOTALL)
            if json_match:
                try:
                    result = json.loads(json_match.group())
                except json.JSONDecodeError:
                    # Still malformed: fall through to the "return validated
                    # content" path instead of escaping from the handler.
                    result = None

        # Only a JSON object matches the presentation schema; lists, strings
        # or numbers would crash on the item assignment below.
        if not isinstance(result, dict):
            print("⚠️ Refinement failed - returning validated content")
            return validated_json

        if 'aliases' not in result:
            result['aliases'] = validated_json.get('aliases', [])
        print("✅ Refinement successful")
        return result

    except Exception as e:
        print(f"❌ Refinement failed: {type(e).__name__}: {e}")
        raise
|
| 481 |
+
|
| 482 |
+
# Module-level statement: runs once when this module is imported, after the
# pipeline functions above have been defined.
# NOTE(review): the leading glyph looks mis-encoded in this copy of the file —
# confirm the intended symbol.
print("β All pipeline functions loaded (Perplexity + Gemini 2.5 Flash/Pro for text)")
|
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,725 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
|
| 7 |
+
|
| 8 |
+
# Import custom functions
|
| 9 |
+
from pipelines_functions import (
|
| 10 |
+
generate_technical_content, correct_technical_content, refine_technical_content,
|
| 11 |
+
generate_operational_content, correct_operational_content, refine_operational_content
|
| 12 |
+
)
|
| 13 |
+
from utils_functions import (
|
| 14 |
+
validate_and_sanitize_topic, check_cache, save_to_cache, validate_and_select_urls,
|
| 15 |
+
get_collections, PipelineMetrics
|
| 16 |
+
)
|
| 17 |
+
from image_generation_functions import process_images_for_pipeline
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Read key/value pairs from a .env file into the process environment
# (python-dotenv) before the rest of the app runs.
load_dotenv()


# ============================================================
# PAGE CONFIGURATION
# ============================================================


# Browser-tab title/icon and page layout: wide content area, sidebar
# collapsed on first load.
# NOTE(review): the page_icon glyph looks mis-encoded in this copy of the
# file — confirm the intended emoji.
st.set_page_config(
    page_title="LearnOnTheGo",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="collapsed"
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ============================================================
|
| 37 |
+
# HELPER FUNCTIONS
|
| 38 |
+
# ============================================================
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def sanitize_for_html(raw_text: str) -> str:
    """Escape HTML special characters for safe embedding.

    Args:
        raw_text: Text to embed in HTML. Non-string values are converted
            with ``str()`` first.

    Returns:
        The text with ``&``, ``<`` and ``>`` replaced by ``&amp;``, ``&lt;``
        and ``&gt;``. Quote characters are left untouched.
    """
    from html import escape

    if not isinstance(raw_text, str):
        raw_text = str(raw_text)
    # The previous replace chain mapped each character to itself, so nothing
    # was escaped. quote=False limits escaping to &, < and >, the three
    # characters the original chain targeted.
    return escape(raw_text, quote=False)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def detect_code_content(text: str) -> bool:
    """Report whether ``text`` contains any marker that suggests code.

    The check is a plain case-sensitive substring search for a fixed set of
    HTML, brace and keyword fragments; the first hit decides the answer.
    """
    markers = (
        '<div', '<html', 'class=', 'style=',
        '{', '}',
        'function', 'import', 'def ',
    )
    for marker in markers:
        if marker in text:
            return True
    return False
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ============================================================
|
| 55 |
+
# CUSTOM CSS - FINAL VERSION WITH SEPARATED PROGRESS BAR
|
| 56 |
+
# ============================================================
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
st.markdown("""
|
| 60 |
+
<style>
|
| 61 |
+
/* Root color palette */
|
| 62 |
+
:root {
|
| 63 |
+
--primary-blue: #2563eb;
|
| 64 |
+
--accent-teal: #0891b2;
|
| 65 |
+
--light-blue: #eff6ff;
|
| 66 |
+
--light-teal: #e0f2fe;
|
| 67 |
+
--text-dark: #1e293b;
|
| 68 |
+
--text-light: #64748b;
|
| 69 |
+
--border-color: #bae6fd;
|
| 70 |
+
--shadow: 0 8px 24px rgba(37, 99, 235, 0.12);
|
| 71 |
+
--shadow-lg: 0 20px 50px rgba(37, 99, 235, 0.25);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
/* Overall app styling */
|
| 75 |
+
.stApp {
|
| 76 |
+
background: linear-gradient(135deg, #dbeafe 0%, #cffafe 100%);
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
/* Hide default Streamlit elements */
|
| 80 |
+
#MainMenu {visibility: hidden;}
|
| 81 |
+
footer {visibility: hidden;}
|
| 82 |
+
header {visibility: hidden;}
|
| 83 |
+
|
| 84 |
+
/* Main container */
|
| 85 |
+
.block-container {
|
| 86 |
+
max-width: 1400px;
|
| 87 |
+
padding-top: 2rem;
|
| 88 |
+
padding-bottom: 2rem;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/* Header styling */
|
| 92 |
+
.header-container {
|
| 93 |
+
background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
|
| 94 |
+
padding: 60px 30px;
|
| 95 |
+
border-radius: 28px;
|
| 96 |
+
text-align: center;
|
| 97 |
+
margin-bottom: 40px;
|
| 98 |
+
box-shadow: var(--shadow-lg);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
.header-container h1 {
|
| 102 |
+
color: white;
|
| 103 |
+
font-size: 62px;
|
| 104 |
+
margin: 0;
|
| 105 |
+
font-weight: 900;
|
| 106 |
+
letter-spacing: 3px;
|
| 107 |
+
text-shadow: 0 4px 12px rgba(0, 0, 0, 0.2);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
/* Search box styling */
|
| 111 |
+
.search-container {
|
| 112 |
+
background: linear-gradient(135deg, #ffffff 0%, #f8fafc 100%);
|
| 113 |
+
padding: 35px;
|
| 114 |
+
border-radius: 24px;
|
| 115 |
+
margin-bottom: 30px;
|
| 116 |
+
border: 3px solid var(--border-color);
|
| 117 |
+
box-shadow: var(--shadow);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
/* Text input styling */
|
| 121 |
+
.stTextInput > div > div > input {
|
| 122 |
+
border: 3px solid var(--accent-teal) !important;
|
| 123 |
+
border-radius: 14px !important;
|
| 124 |
+
padding: 16px 20px !important;
|
| 125 |
+
font-size: 17px !important;
|
| 126 |
+
background-color: white !important;
|
| 127 |
+
transition: all 0.3s ease !important;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.stTextInput > div > div > input:focus {
|
| 131 |
+
border-color: var(--primary-blue) !important;
|
| 132 |
+
box-shadow: 0 0 0 4px rgba(37, 99, 235, 0.15) !important;
|
| 133 |
+
outline: none !important;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.stTextInput > div > div > input::placeholder {
|
| 137 |
+
color: rgba(100, 116, 139, 0.6) !important;
|
| 138 |
+
font-weight: 500 !important;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
/* Radio container */
|
| 142 |
+
.stRadio > div[role="radiogroup"] {
|
| 143 |
+
display: flex !important;
|
| 144 |
+
gap: 0 !important;
|
| 145 |
+
background: #e0e7ff !important;
|
| 146 |
+
border-radius: 14px !important;
|
| 147 |
+
padding: 4px !important;
|
| 148 |
+
border: 3px solid var(--border-color) !important;
|
| 149 |
+
width: fit-content !important;
|
| 150 |
+
margin: 0 auto !important;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
/* Individual radio labels */
|
| 154 |
+
.stRadio > div[role="radiogroup"] > label {
|
| 155 |
+
background: transparent !important;
|
| 156 |
+
border: none !important;
|
| 157 |
+
padding: 12px 32px !important;
|
| 158 |
+
border-radius: 10px !important;
|
| 159 |
+
cursor: pointer !important;
|
| 160 |
+
transition: all 0.3s ease !important;
|
| 161 |
+
font-weight: 700 !important;
|
| 162 |
+
color: var(--text-light) !important;
|
| 163 |
+
font-size: 15px !important;
|
| 164 |
+
text-align: center !important;
|
| 165 |
+
min-width: 140px !important;
|
| 166 |
+
margin: 0 !important;
|
| 167 |
+
flex: 1 !important;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
.stRadio > div[role="radiogroup"] > label:hover {
|
| 171 |
+
background: rgba(255, 255, 255, 0.5) !important;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.stRadio > div[role="radiogroup"] > label[data-checked="true"],
|
| 175 |
+
.stRadio > div[role="radiogroup"] > label:has(input:checked),
|
| 176 |
+
.stRadio > div[role="radiogroup"] > label[aria-checked="true"] {
|
| 177 |
+
background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
|
| 178 |
+
color: white !important;
|
| 179 |
+
box-shadow: 0 4px 12px rgba(37, 99, 235, 0.35) !important;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.stRadio input[type="radio"] {
|
| 183 |
+
display: none !important;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
/* Button styling */
|
| 187 |
+
.stButton > button {
|
| 188 |
+
background: linear-gradient(100deg, var(--primary-blue) 0%, var(--accent-teal) 100%) !important;
|
| 189 |
+
color: white !important;
|
| 190 |
+
font-weight: 800 !important;
|
| 191 |
+
padding: 18px 40px !important;
|
| 192 |
+
border-radius: 14px !important;
|
| 193 |
+
border: none !important;
|
| 194 |
+
transition: all 0.3s ease !important;
|
| 195 |
+
box-shadow: 0 6px 20px rgba(37, 99, 235, 0.3) !important;
|
| 196 |
+
font-size: 17px !important;
|
| 197 |
+
letter-spacing: 0.5px !important;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
.stButton > button:hover {
|
| 201 |
+
background: linear-gradient(100deg, #1d4ed8 0%, #0e7490 100%) !important;
|
| 202 |
+
transform: translateY(-3px) !important;
|
| 203 |
+
box-shadow: 0 10px 30px rgba(37, 99, 235, 0.4) !important;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
/* SLIDE BOX - WITHOUT PROGRESS BAR */
|
| 207 |
+
.slide-box-wrapper {
|
| 208 |
+
background: white;
|
| 209 |
+
border-radius: 28px;
|
| 210 |
+
border: 3px solid var(--border-color);
|
| 211 |
+
box-shadow: 0 20px 60px rgba(37, 99, 235, 0.18);
|
| 212 |
+
padding: 50px 45px;
|
| 213 |
+
margin: 40px auto;
|
| 214 |
+
max-width: 1000px;
|
| 215 |
+
animation: slideIn 0.5s ease-out;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
@keyframes slideIn {
|
| 219 |
+
from {
|
| 220 |
+
opacity: 0;
|
| 221 |
+
transform: translateY(30px);
|
| 222 |
+
}
|
| 223 |
+
to {
|
| 224 |
+
opacity: 1;
|
| 225 |
+
transform: translateY(0);
|
| 226 |
+
}
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
/* Slide content */
|
| 230 |
+
.slide-box-wrapper > * {
|
| 231 |
+
display: block !important;
|
| 232 |
+
width: 100% !important;
|
| 233 |
+
box-sizing: border-box !important;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
/* Slide title */
|
| 237 |
+
.slide-box-wrapper h2 {
|
| 238 |
+
font-size: 42px;
|
| 239 |
+
font-weight: 900;
|
| 240 |
+
color: var(--primary-blue);
|
| 241 |
+
margin: 0 0 20px 0 !important;
|
| 242 |
+
letter-spacing: 1px;
|
| 243 |
+
line-height: 1.3;
|
| 244 |
+
text-shadow: 0 2px 8px rgba(37, 99, 235, 0.15);
|
| 245 |
+
word-wrap: break-word;
|
| 246 |
+
overflow-wrap: break-word;
|
| 247 |
+
text-align: center;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
/* Slide text and paragraphs */
|
| 251 |
+
.slide-box-wrapper p {
|
| 252 |
+
font-size: 20px;
|
| 253 |
+
color: var(--text-dark);
|
| 254 |
+
line-height: 2.2;
|
| 255 |
+
margin: 0 0 24px 0 !important;
|
| 256 |
+
font-weight: 500;
|
| 257 |
+
text-align: left;
|
| 258 |
+
padding: 0 20px;
|
| 259 |
+
box-sizing: border-box;
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
/* Code block styling */
|
| 263 |
+
.slide-box-wrapper pre {
|
| 264 |
+
background: #f8fafc;
|
| 265 |
+
border-radius: 12px;
|
| 266 |
+
padding: 20px;
|
| 267 |
+
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
|
| 268 |
+
font-size: 14px;
|
| 269 |
+
color: #0f172a;
|
| 270 |
+
overflow-x: auto;
|
| 271 |
+
border: 2px solid #e0e7ff;
|
| 272 |
+
margin: 16px 20px !important;
|
| 273 |
+
text-align: left;
|
| 274 |
+
line-height: 1.6;
|
| 275 |
+
white-space: pre-wrap;
|
| 276 |
+
word-wrap: break-word;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
/* Images inside slide box */
|
| 280 |
+
.slide-box-wrapper img {
|
| 281 |
+
max-width: 90% !important;
|
| 282 |
+
height: auto !important;
|
| 283 |
+
border-radius: 20px !important;
|
| 284 |
+
box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2) !important;
|
| 285 |
+
display: block !important;
|
| 286 |
+
margin: 24px auto !important;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
/* Learning Resources */
|
| 290 |
+
.resources-section {
|
| 291 |
+
margin-top: 32px;
|
| 292 |
+
padding-top: 28px;
|
| 293 |
+
border-top: 3px solid var(--border-color);
|
| 294 |
+
text-align: left;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
.resources-section h4 {
|
| 298 |
+
color: var(--primary-blue);
|
| 299 |
+
font-size: 22px;
|
| 300 |
+
margin: 0 0 18px 0 !important;
|
| 301 |
+
font-weight: 800;
|
| 302 |
+
text-align: center;
|
| 303 |
+
}
|
| 304 |
+
|
| 305 |
+
.resources-section a {
|
| 306 |
+
color: var(--accent-teal);
|
| 307 |
+
text-decoration: none;
|
| 308 |
+
font-weight: 600;
|
| 309 |
+
transition: all 0.3s ease;
|
| 310 |
+
display: block;
|
| 311 |
+
padding: 10px 15px;
|
| 312 |
+
font-size: 16px;
|
| 313 |
+
border-radius: 8px;
|
| 314 |
+
margin-bottom: 8px;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
.resources-section a:hover {
|
| 318 |
+
color: white;
|
| 319 |
+
background: var(--primary-blue);
|
| 320 |
+
padding-left: 20px;
|
| 321 |
+
box-shadow: 0 4px 12px rgba(37, 99, 235, 0.2);
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
/* PROGRESS CONTAINER - MOVED OUTSIDE BOX */
|
| 325 |
+
.progress-container {
|
| 326 |
+
display: flex;
|
| 327 |
+
align-items: center;
|
| 328 |
+
justify-content: center;
|
| 329 |
+
gap: 20px;
|
| 330 |
+
margin: 30px auto;
|
| 331 |
+
padding: 20px 0;
|
| 332 |
+
max-width: 1000px;
|
| 333 |
+
width: 100%;
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
.progress-bar {
|
| 337 |
+
flex: 1;
|
| 338 |
+
max-width: 700px;
|
| 339 |
+
height: 10px;
|
| 340 |
+
background: #e0e7ff;
|
| 341 |
+
border-radius: 12px;
|
| 342 |
+
overflow: hidden;
|
| 343 |
+
box-shadow: inset 0 2px 4px rgba(37, 99, 235, 0.1);
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
.progress-fill {
|
| 347 |
+
height: 100%;
|
| 348 |
+
background: linear-gradient(90deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
|
| 349 |
+
transition: width 0.4s ease;
|
| 350 |
+
box-shadow: 0 0 10px rgba(37, 99, 235, 0.4);
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
/* Slide counter badge */
|
| 354 |
+
.slide-counter-badge {
|
| 355 |
+
background: linear-gradient(110deg, var(--primary-blue) 0%, var(--accent-teal) 100%);
|
| 356 |
+
color: white;
|
| 357 |
+
padding: 10px 20px;
|
| 358 |
+
border-radius: 24px;
|
| 359 |
+
font-size: 16px;
|
| 360 |
+
font-weight: 800;
|
| 361 |
+
min-width: 90px;
|
| 362 |
+
text-align: center;
|
| 363 |
+
box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
|
| 364 |
+
white-space: nowrap;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
/* Footer */
|
| 368 |
+
.footer-bar {
|
| 369 |
+
margin-top: 60px;
|
| 370 |
+
text-align: center;
|
| 371 |
+
color: var(--text-dark);
|
| 372 |
+
font-size: 16px;
|
| 373 |
+
letter-spacing: 0.5px;
|
| 374 |
+
padding: 32px 0;
|
| 375 |
+
border-top: 3px solid var(--border-color);
|
| 376 |
+
background: white;
|
| 377 |
+
border-radius: 20px;
|
| 378 |
+
box-shadow: 0 4px 16px rgba(37, 99, 235, 0.08);
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
.footer-bar p {
|
| 382 |
+
margin: 10px 0;
|
| 383 |
+
font-weight: 600;
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
.footer-bar p:last-child {
|
| 387 |
+
font-size: 14px;
|
| 388 |
+
color: var(--text-light);
|
| 389 |
+
font-weight: 500;
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
/* Alert messages */
|
| 393 |
+
.stSuccess {
|
| 394 |
+
border-radius: 14px !important;
|
| 395 |
+
border-left: 5px solid #10b981 !important;
|
| 396 |
+
background-color: #ecfdf5 !important;
|
| 397 |
+
padding: 16px !important;
|
| 398 |
+
font-weight: 600 !important;
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
.stInfo {
|
| 402 |
+
border-radius: 14px !important;
|
| 403 |
+
border-left: 5px solid var(--primary-blue) !important;
|
| 404 |
+
background-color: #f0f9ff !important;
|
| 405 |
+
padding: 16px !important;
|
| 406 |
+
font-weight: 600 !important;
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
.stError {
|
| 410 |
+
border-radius: 14px !important;
|
| 411 |
+
border-left: 5px solid #ef4444 !important;
|
| 412 |
+
background-color: #fef2f2 !important;
|
| 413 |
+
padding: 16px !important;
|
| 414 |
+
font-weight: 600 !important;
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
/* Mobile responsive */
|
| 418 |
+
@media (max-width: 768px) {
|
| 419 |
+
.header-container h1 {
|
| 420 |
+
font-size: 40px;
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
.slide-box-wrapper {
|
| 424 |
+
padding: 30px 20px;
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
.slide-box-wrapper h2 {
|
| 428 |
+
font-size: 30px;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
.slide-box-wrapper p {
|
| 432 |
+
font-size: 17px;
|
| 433 |
+
line-height: 1.9;
|
| 434 |
+
padding: 0 10px;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
.progress-container {
|
| 438 |
+
flex-direction: row;
|
| 439 |
+
gap: 15px;
|
| 440 |
+
margin: 20px auto;
|
| 441 |
+
padding: 15px 0;
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
.progress-bar {
|
| 445 |
+
width: 100%;
|
| 446 |
+
max-width: none;
|
| 447 |
+
}
|
| 448 |
+
}
|
| 449 |
+
</style>
|
| 450 |
+
""", unsafe_allow_html=True)
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
# ============================================================
|
| 454 |
+
# SESSION STATE INITIALIZATION
|
| 455 |
+
# ============================================================
|
| 456 |
+
|
| 457 |
+
|
| 458 |
+
# Seed each session key the app reads with its initial value. A key that is
# already present (i.e. on a Streamlit rerun) is left untouched.
_SESSION_DEFAULTS = {
    "current_slide": 0,
    "slides_data": None,
    "search_query": "",
    "mode": "technical",
    "is_loading": False,
    "error_message": None,
    "metrics": None,
}

for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
# ============================================================
|
| 475 |
+
# PIPELINE FUNCTION
|
| 476 |
+
# ============================================================
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
def run_pipeline(query, mode):
    """Serve a topic from the cache or run the 5-stage generation pipeline.

    Stages on a cache miss: Generate, Correct, Validate URLs, Refine and
    Generate Images. Each stage is timed through ``PipelineMetrics``.

    Args:
        query: Topic text from the user. It is passed through
            ``validate_and_sanitize_topic`` before any other use.
        mode: ``"operational"`` selects the operational collection and stage
            functions. Every other value selects the technical ones, so the
            collection and the stage functions can never disagree.

    Returns:
        A ``(success, message)`` tuple. On success the slides are stored in
        ``st.session_state.slides_data`` and ``current_slide`` is reset to 0.
        On failure the error text is stored in
        ``st.session_state.error_message`` and recorded in the run's metrics.
    """
    metrics = None
    try:
        metrics = PipelineMetrics(query, mode)
        query = validate_and_sanitize_topic(query)

        technical_col, operational_col, _db = get_collections()

        # One decision drives the collection, the stage functions and the
        # image mode.
        if mode == "operational":
            collection = operational_col
            generate = generate_operational_content
            correct = correct_operational_content
            refine = refine_operational_content
            image_mode = "operational"
        else:
            collection = technical_col
            generate = generate_technical_content
            correct = correct_technical_content
            refine = refine_technical_content
            image_mode = "technical"

        # Cache check
        metrics.start_stage("Cache Check")
        cached_content, is_cached = check_cache(query, collection)
        if is_cached:
            metrics.set_cache_hit("mongodb")
        metrics.end_stage("Cache Check")

        if is_cached:
            st.session_state.slides_data = cached_content
            st.session_state.current_slide = 0
            metrics.end()
            metrics.save_metrics()
            return True, "✅ Retrieved from cache (instant!)"

        st.session_state.is_loading = True

        with st.spinner(f"π Generating {mode} content with images (5 stages)..."):
            metrics.start_stage("Generate")
            generated = generate(query)
            metrics.end_stage("Generate", f"{len(generated.get('content', []))} slides")

            metrics.start_stage("Correct")
            corrected = correct(generated)
            metrics.end_stage("Correct", "Content improved")

            metrics.start_stage("Validate URLs")
            validated, _ = validate_and_select_urls(corrected)
            metrics.end_stage("Validate URLs", f"{len(validated.get('urls', []))} URLs validated")

            metrics.start_stage("Refine")
            refined = refine(validated)
            metrics.end_stage("Refine", "Content refined")

            metrics.start_stage("Generate Images")
            final_result = process_images_for_pipeline(refined, mode=image_mode)
            metrics.end_stage("Generate Images", "Images generated")

        save_to_cache(query, final_result, collection)
        st.session_state.slides_data = final_result
        st.session_state.current_slide = 0
        st.session_state.is_loading = False

        pipeline_metrics = metrics.end()
        metrics.save_metrics()
        st.session_state.metrics = pipeline_metrics

        total_time = pipeline_metrics.get('total_duration_seconds', 0)
        return True, f"✅ Generated {len(final_result.get('content', []))} slides in {total_time:.1f}s!"

    except Exception as e:
        st.session_state.is_loading = False
        st.session_state.error_message = str(e)
        # Record the failure so broken runs also appear in the stored metrics.
        # `metrics` is still None if PipelineMetrics() itself raised.
        if metrics is not None:
            metrics.set_error(str(e))
            metrics.end()
            metrics.save_metrics()
        return False, f"β Error: {str(e)}"
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# ============================================================
|
| 565 |
+
# DISPLAY SLIDE FUNCTION - PROGRESS BAR OUTSIDE BOX
|
| 566 |
+
# ============================================================
|
| 567 |
+
|
| 568 |
+
|
| 569 |
+
def display_slide(slide_index):
    """Render one slide box, the progress bar below it and the prev/next buttons.

    The slide is read from ``st.session_state.slides_data['content']``. The
    last slide also lists the entries of ``slides_data['urls']`` as links.
    Nothing is rendered when there is no slide data or ``slide_index`` is out
    of range (negative indexes included).

    Args:
        slide_index: Zero-based position of the slide to show.
    """
    # Local import: the module's import block lies outside this function.
    from html import escape

    web_schemes = ('http://', 'https://')

    if not st.session_state.slides_data:
        return

    slides = st.session_state.slides_data.get('content', [])
    if not slides or not 0 <= slide_index < len(slides):
        return

    slide = slides[slide_index]
    total_slides = len(slides)
    progress_percent = ((slide_index + 1) / total_slides) * 100

    # Build the slide box HTML (WITHOUT progress bar)
    title = sanitize_for_html(slide.get("slide_title", ""))
    raw_content = slide.get("slide_content", "")

    # Code-like content goes in <pre>, everything else in <p>
    if detect_code_content(raw_content):
        sanitized_content = f"<pre>{sanitize_for_html(raw_content)}</pre>"
    else:
        sanitized_content = f"<p>{sanitize_for_html(raw_content)}</p>"

    slide_html = f"""
<div class="slide-box-wrapper">
<h2>{title}</h2>
{sanitized_content}
"""

    # Add image if available. The value is generated content, so it is
    # restricted to web URLs and escaped before it lands in an attribute.
    img_url = slide.get('image_description')
    if isinstance(img_url, str) and img_url.startswith(web_schemes):
        safe_src = escape(img_url, quote=True)
        slide_html += f'<img src="{safe_src}" alt="Slide image" style="max-width: 90%; height: auto; display: block; margin: 24px auto; border-radius: 20px; box-shadow: 0 8px 24px rgba(37, 99, 235, 0.2);">'

    # Add learning resources (last slide only)
    if slide_index == total_slides - 1:
        urls = st.session_state.slides_data.get('urls', [])
        if urls:
            slide_html += '<div class="resources-section"><h4>π Learning Resources</h4>'
            for i, url_obj in enumerate(urls, 1):
                url_title = sanitize_for_html(url_obj.get('title', 'Documentation'))
                url = url_obj.get('url', '#')
                # Anything that is not an http(s) URL (e.g. "javascript:")
                # falls back to the inert '#' target.
                if not (isinstance(url, str) and url.startswith(web_schemes)):
                    url = '#'
                safe_href = escape(url, quote=True)
                slide_html += f'<a href="{safe_href}" target="_blank" rel="noopener noreferrer">{i}. {url_title}</a>'
            slide_html += '</div>'

    # Close the slide box (NO progress bar)
    slide_html += '</div>'

    # Render the slide box
    st.markdown(slide_html, unsafe_allow_html=True)

    # Progress bar is rendered outside the box
    progress_html = f"""
<div class="progress-container">
<div class="progress-bar">
<div class="progress-fill" style="width: {progress_percent}%"></div>
</div>
<div class="slide-counter-badge">{slide_index + 1} / {total_slides}</div>
</div>
"""
    st.markdown(progress_html, unsafe_allow_html=True)

    # Navigation buttons below
    st.markdown('<br>', unsafe_allow_html=True)
    col_left, col_center, col_right = st.columns([1, 8, 1])

    with col_left:
        if slide_index > 0:
            if st.button("⬅", key="prev_btn", help="Previous slide", use_container_width=True):
                st.session_state.current_slide -= 1
                st.rerun()

    with col_right:
        if slide_index < total_slides - 1:
            if st.button("➡", key="next_btn", help="Next slide", use_container_width=True):
                st.session_state.current_slide += 1
                st.rerun()
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
# ============================================================
|
| 650 |
+
# PAGE LAYOUT
|
| 651 |
+
# ============================================================
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
# Header
|
| 655 |
+
# Header banner
header_markup = '<div class="header-container"><h1>π LearnOnTheGo</h1></div>'
st.markdown(header_markup, unsafe_allow_html=True)

# Search row: a wide topic box beside a narrow mode selector
st.markdown('<div class="search-container">', unsafe_allow_html=True)

topic_column, mode_column = st.columns([3, 1])

with topic_column:
    search_query = st.text_input(
        "Search",
        value=st.session_state.search_query,
        placeholder="e.g., Python, Machine Learning, Cloud Computing...",
        key="search_input",
        label_visibility="collapsed",
    )
    st.session_state.search_query = search_query

with mode_column:
    mode_labels = ["Technical", "Operational"]
    preselected = 0 if st.session_state.mode == "technical" else 1
    mode = st.radio(
        "Mode",
        options=mode_labels,
        index=preselected,
        key="mode_radio",
        horizontal=True,
        label_visibility="collapsed",
    )
    # Session state keeps the lower-case form that run_pipeline compares against.
    st.session_state.mode = mode.lower()

st.markdown('</div>', unsafe_allow_html=True)

# Generate button, placed in the middle of three columns
_, button_column, _ = st.columns([1, 2, 1])
with button_column:
    search_button = st.button("π Generate Slides", key="search_btn", use_container_width=True)

# Show an error left over from an earlier run once, then clear it
pending_error = st.session_state.error_message
if pending_error:
    st.error(pending_error)
    st.session_state.error_message = None

# Run the pipeline only when the button was pressed and a query is present
if search_button and st.session_state.search_query:
    success, message = run_pipeline(st.session_state.search_query, st.session_state.mode)
    report = st.success if success else st.error
    report(message)

# Slides if there are any, otherwise a hint
slides_payload = st.session_state.slides_data
if not slides_payload:
    st.info("π Enter a topic and click 'Generate Slides' to get started!")
else:
    st.markdown("---")
    slide_count = len(slides_payload.get('content', []))
    # A stale index (e.g. from a longer, earlier deck) restarts at the first slide.
    if st.session_state.current_slide >= slide_count:
        st.session_state.current_slide = 0
    display_slide(st.session_state.current_slide)

# Footer
footer_markup = """<div class="footer-bar">
<p><strong>LearnOnTheGo</strong> β’ Powered by AI β’ Built with Streamlit</p>
<p>5-Stage Pipeline: Generate β Correct β Validate β Refine β Generate Images</p>
<p>Gemini 2.5 Flash (text) β’ Gemini 2.5 Flash Image (images) β’ Perplexity Sonar Pro</p>
</div>"""
st.markdown(footer_markup, unsafe_allow_html=True)
|
| 724 |
+
|
| 725 |
+
print("β
LearnOnTheGo - Progress bar moved outside box - Fixed!")
|
src/utils_functions.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import pickle
|
| 4 |
+
import hashlib
|
| 5 |
+
import httpx
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from pymongo import MongoClient
|
| 8 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
# ============================================================
|
| 14 |
+
# PIPELINE METRICS CLASS (Complete tracking system)
|
| 15 |
+
# ============================================================
|
| 16 |
+
|
| 17 |
+
class PipelineMetrics:
    """
    Timing and outcome tracker for one pipeline run.

    Records per-stage durations, cache hits and errors, builds a summary
    dict in ``end()`` and writes it to MongoDB in ``save_metrics()``.
    """

    def __init__(self, topic, mode):
        """Start tracking a run for *topic* in *mode*; the clock starts now."""
        self.topic = topic
        self.mode = mode
        self.run_id = f"{mode}_{int(datetime.now().timestamp())}"
        self.start_time = datetime.now(timezone.utc)
        self.stages = {}
        self.current_stage = None
        self.current_stage_start = None
        self.cache_hit = False
        self.cache_type = None
        self.error_occurred = False
        self.error_message = None
        # Summary dict; stays None until end() builds it.
        self.metrics = None

    def start_stage(self, stage_name):
        """Mark *stage_name* as the running stage and start its timer."""
        self.current_stage = stage_name
        self.current_stage_start = datetime.now(timezone.utc)
        print(f" π [METRICS] Starting: {stage_name}")

    def end_stage(self, stage_name, output_summary=None):
        """Record the duration of the running stage under *stage_name*.

        Does nothing when no stage timer is running. The timer is cleared
        afterwards so a later ``end_stage`` without a matching
        ``start_stage`` is not measured against a stale start time.
        """
        if self.current_stage_start:
            finished_at = datetime.now(timezone.utc)
            duration = (finished_at - self.current_stage_start).total_seconds()
            self.stages[stage_name] = {
                "duration_seconds": duration,
                "timestamp": finished_at,
                "output_summary": output_summary
            }
            self.current_stage = None
            self.current_stage_start = None
            print(f" β Stage '{stage_name}' completed in {duration:.2f}s")

    def set_cache_hit(self, cache_type="mongodb"):
        """Flag the run as served from a cache of the given type."""
        self.cache_hit = True
        self.cache_type = cache_type
        print(f" πΎ Cache hit: {cache_type}")

    def set_error(self, error_message):
        """Flag the run as failed and keep the error text."""
        self.error_occurred = True
        self.error_message = error_message
        print(f" β Error: {error_message}")

    def end(self):
        """Stop the run clock, build the summary dict and return it."""
        completed_at = datetime.now(timezone.utc)
        total_duration = (completed_at - self.start_time).total_seconds()
        self.metrics = {
            "run_id": self.run_id,
            "topic": self.topic,
            "mode": self.mode,
            "started_at": self.start_time,
            "completed_at": completed_at,
            "total_duration_seconds": total_duration,
            "stages": self.stages,
            "cache_hit": self.cache_hit,
            "cache_type": self.cache_type,
            "error_occurred": self.error_occurred,
            "error_message": self.error_message
        }
        print(f"\n π Pipeline Complete: {total_duration:.2f}s total")
        return self.metrics

    def save_metrics(self):
        """Write the run summary and one document per stage to MongoDB.

        Technical runs go to ``pipelinemetrics`` / ``stageoutputs``; all
        other modes go to ``operational_pipeline_metrics`` /
        ``operational_stage_outputs``. If ``end()`` has not been called yet
        it is called here first.

        Returns:
            True when everything was written. False when ``MONGO_URI`` is
            unset or any step fails (the error is printed, never raised).
        """
        try:
            mongo_uri = os.getenv("MONGO_URI")
            if not mongo_uri:
                print(" β οΈ MONGO_URI not set - skipping metrics save")
                return False

            if self.metrics is None:
                self.end()

            client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
            try:
                db = client["learnToGo"]

                # Collections based on mode
                if self.mode == "technical":
                    metrics_col = db["pipelinemetrics"]
                    stages_col = db["stageoutputs"]
                else:
                    metrics_col = db["operational_pipeline_metrics"]
                    stages_col = db["operational_stage_outputs"]

                # insert_one adds an "_id" key to the dict it is given, so
                # insert a copy and leave the summary held by callers as is.
                metrics_col.insert_one(dict(self.metrics))

                # Save stage details
                for stage_name, stage_data in self.stages.items():
                    stages_col.insert_one({
                        "run_id": self.run_id,
                        "topic": self.topic,
                        "mode": self.mode,
                        "stage_name": stage_name,
                        "stage_data": stage_data
                    })
            finally:
                # A new client is opened on every call; close it so its
                # connections are not left behind.
                client.close()

            print(" β Metrics saved to MongoDB")
            return True

        except Exception as e:
            print(f" β οΈ Could not save metrics: {e}")
            return False
|
| 124 |
+
|
| 125 |
+
# ============================================================
|
| 126 |
+
# MONGODB CONNECTION & COLLECTIONS
|
| 127 |
+
# ============================================================
|
| 128 |
+
|
| 129 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    reraise=True,
)
def _open_mongo_client(mongo_uri):
    """Build the client, retrying up to three times with exponential back-off.

    ``reraise=True`` makes the last underlying exception reach the caller
    instead of tenacity's ``RetryError`` wrapper.
    """
    return MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)


def get_mongo_client():
    """Return a MongoClient for the URI in the ``MONGO_URI`` environment variable.

    Raises:
        ValueError: ``MONGO_URI`` is unset or empty. This is a configuration
            error, so it is raised at once rather than retried.
    """
    mongo_uri = os.getenv("MONGO_URI")
    if not mongo_uri:
        raise ValueError("MONGO_URI not set in .env")
    return _open_mongo_client(mongo_uri)
|
| 139 |
+
|
| 140 |
+
def get_collections():
    """Return ``(technical_collection, operational_collection, db)``.

    Both keyword collections live in the ``learnToGo`` database. An index on
    ``aliases`` is requested for each one on every call.
    """
    database = get_mongo_client()["learnToGo"]

    keyword_collections = (database["Keywords"], database["OperationalKeywords"])
    for keyword_collection in keyword_collections:
        keyword_collection.create_index("aliases")

    technical, operational = keyword_collections
    return technical, operational, database
|
| 153 |
+
|
| 154 |
+
# ============================================================
|
| 155 |
+
# URL CACHING (Pickle-based - FIXED with proper dict structure)
|
| 156 |
+
# ============================================================
|
| 157 |
+
|
| 158 |
+
URL_CACHE_FILE = "/tmp/url_validation_cache.pkl"


def load_url_cache():
    """Load the URL validation cache from ``URL_CACHE_FILE``.

    Returns:
        The cached dict. An empty dict is returned when the file does not
        exist, cannot be read or unpickled, or does not contain a dict.
    """
    try:
        if os.path.exists(URL_CACHE_FILE):
            # SECURITY NOTE: unpickling runs code chosen by whoever wrote the
            # file, and this path is in a shared temp directory. Only safe
            # while nothing untrusted can write there; a JSON file would
            # avoid the problem.
            with open(URL_CACHE_FILE, 'rb') as f:
                cache = pickle.load(f)
            # Callers index and assign into the result, so anything other
            # than a dict is treated like a corrupt file.
            if not isinstance(cache, dict):
                raise TypeError(f"expected dict, got {type(cache).__name__}")
            print(f"β Loaded URL cache with {len(cache)} entries")
            return cache
    except Exception as e:
        print(f"β οΈ Could not load URL cache: {e}")
    return {}
|
| 171 |
+
|
| 172 |
+
def save_url_cache(cache):
    """Pickle *cache* to ``URL_CACHE_FILE``.

    The data is written to a temporary file in the same directory and then
    moved over the target, so a failed dump leaves the previous cache file
    intact and readers never see a half-written file.

    Returns:
        True on success, False if anything went wrong (the error is printed).
    """
    import tempfile  # local import: only this function needs it

    tmp_path = None
    try:
        cache_dir = os.path.dirname(URL_CACHE_FILE) or "."
        fd, tmp_path = tempfile.mkstemp(dir=cache_dir, suffix=".tmp")
        with os.fdopen(fd, 'wb') as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, URL_CACHE_FILE)
        print(f"β Saved URL cache with {len(cache)} entries")
        return True
    except Exception as e:
        print(f"β οΈ Could not save URL cache: {e}")
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Already moved into place or never created; nothing to clean.
                pass
        return False
|
| 182 |
+
|
| 183 |
+
def get_url_hash(url):
    """Return the hexadecimal MD5 digest of *url*, used as its cache key."""
    digest = hashlib.new("md5")
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()
|
| 186 |
+
|
| 187 |
+
@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=2, max=5)
)
def validate_url_cached(url, timeout=5):
    """Return whether *url* is reachable, using the on-disk cache first.

    On a cache miss the URL is probed with HEAD. If HEAD raises or is not
    answered with an accepted status, a GET is tried, because some servers
    reject HEAD while serving the page normally. The verdict is stored in
    the cache as ``{'valid', 'checked_at', 'url'}``.

    Args:
        url: URL to check.
        timeout: Per-request timeout in seconds passed to httpx.

    Returns:
        True if the URL was judged reachable, otherwise False.
    """
    url_hash = get_url_hash(url)

    # Load cache
    url_cache = load_url_cache()

    # Check cache
    if url_hash in url_cache:
        print(f" πΎ URL cache hit: {url[:50]}...")
        entry = url_cache[url_hash]
        # Entries are written as dicts below. A bare value is presumably
        # left over from an older cache format, so read it as the verdict.
        return entry['valid'] if isinstance(entry, dict) else bool(entry)

    # Validate URL: cheap HEAD first
    try:
        response = httpx.head(url, timeout=timeout, follow_redirects=True)
        is_valid = response.status_code in [200, 301, 302, 303, 307, 308]
    except Exception:
        # Best-effort probe: any failure just means "try GET next".
        is_valid = False

    if not is_valid:
        try:
            response = httpx.get(url, timeout=timeout, follow_redirects=True)
            is_valid = response.status_code == 200
        except Exception:
            is_valid = False

    # Save to cache as DICT with valid, checked_at, url
    url_cache[url_hash] = {
        'valid': is_valid,
        'checked_at': datetime.now(timezone.utc).isoformat(),
        'url': url
    }
    save_url_cache(url_cache)

    print(f" β URL validated: {url[:50]}... = {is_valid}")
    return is_valid
|
| 224 |
+
|
| 225 |
+
# ============================================================
|
| 226 |
+
# CACHE OPERATIONS
|
| 227 |
+
# ============================================================
|
| 228 |
+
|
| 229 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
def check_cache(topic, collection):
    """
    Look up *topic* in the MongoDB cache by its normalized form - NO LLM call!

    The topic is lower-cased and stripped, then matched against the
    ``aliases`` field of *collection*. Failures are printed and re-raised so
    the retry decorator can try again.

    Returns:
        ``(content, True)`` on a hit, ``(None, False)`` on a miss.
    """
    try:
        normalized = topic.lower().strip()
        print(f"π Checking cache for: {normalized}")

        cached = collection.find_one({"aliases": normalized})

        if cached:
            # The topic is only used for logging; a document without one
            # must not turn a hit into an error.
            print(f"✅ CACHE HIT! Found topic: {cached.get('topic', normalized)}")
            return cached['content'], True

        print("β CACHE MISS - Will run full pipeline")
        return None, False

    except Exception as e:
        print(f"β Cache lookup error: {e}")
        raise
|
| 254 |
+
|
| 255 |
+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
def save_to_cache(topic, content, collection):
    """
    Save generated slides to MongoDB.

    The stored ``aliases`` always include the lower-cased, stripped *topic*,
    because that is the value ``check_cache`` searches for. Without it, a
    document whose generated aliases omit the user's wording would never be
    found again. Failures are printed and re-raised so the retry decorator
    can try again.

    Returns:
        The ``inserted_id`` of the new document.
    """
    try:
        normalized = topic.lower().strip()

        raw_aliases = content.get('aliases') or []
        if isinstance(raw_aliases, str):
            # A lone string would otherwise be split into characters.
            raw_aliases = [raw_aliases]
        aliases = list(raw_aliases)
        if normalized not in aliases:
            aliases.append(normalized)

        document = {
            "topic": content.get('topic', topic),
            "aliases": aliases,
            "createdAt": datetime.now(timezone.utc),
            "content": content
        }

        result = collection.insert_one(document)
        print(f"✅ Saved to MongoDB - Document ID: {result.inserted_id}")
        return result.inserted_id

    except Exception as e:
        print(f"β Cache save error: {e}")
        raise
|
| 281 |
+
|
| 282 |
+
# ============================================================
|
| 283 |
+
# URL VALIDATION & SELECTION
|
| 284 |
+
# ============================================================
|
| 285 |
+
|
| 286 |
+
def validate_and_select_urls(corrected_json):
    """
    Validate every URL entry and keep at most five valid ones.

    Each entry with a non-empty ``url`` is checked with
    ``validate_url_cached``. The input dict is updated in place
    (``corrected_json["urls"]`` is replaced) and also returned.

    Args:
        corrected_json: Dict whose optional ``urls`` entry is a list of dicts
            carrying ``url`` and ``title`` keys. A missing or null ``urls``
            entry is treated as an empty list.

    Returns:
        ``(corrected_json, validation_results)`` where ``validation_results``
        holds one ``{"url", "title", "valid"}`` dict per checked URL.
    """
    # `.get("urls", [])` would still return None for an explicit null value.
    urls = corrected_json.get("urls") or []
    print(f"Validating {len(urls)} URLs with caching...")

    valid_urls = []
    validation_results = []

    for url_obj in urls:
        # Skip malformed entries instead of crashing on `.get`.
        if not isinstance(url_obj, dict):
            continue

        url = url_obj.get("url")
        if not url:
            continue

        is_valid = validate_url_cached(url)

        validation_results.append({
            "url": url,
            "title": url_obj.get("title"),
            "valid": is_valid,
        })

        if is_valid:
            valid_urls.append(url_obj)

    # Keep only the first 5 valid URLs (input order is preserved; no ranking).
    valid_urls = valid_urls[:5]

    print(f"✓ Kept {len(valid_urls)} valid URLs")

    corrected_json["urls"] = valid_urls
    return corrected_json, validation_results
|
| 318 |
+
|
| 319 |
+
# ============================================================
|
| 320 |
+
# INPUT VALIDATION (50 char limit for both technical and operational)
|
| 321 |
+
# ============================================================
|
| 322 |
+
|
| 323 |
+
def validate_and_sanitize_topic(topic):
    """
    Validate and sanitize the user-supplied topic before the pipeline runs.

    The check is deterministic, so it is intentionally not wrapped in a retry
    decorator: retrying invalid input can never succeed, only delays the
    failure, and makes the caller receive the retry library's wrapper error
    instead of the ``ValueError`` raised here.

    Args:
        topic: Raw topic string entered by the user.

    Returns:
        The topic with surrounding whitespace removed.

    Raises:
        ValueError: If the topic is not a string, is empty or whitespace
            only, or is longer than 50 characters after stripping.
    """
    max_length = 50  # same limit for technical and operational topics

    if topic is not None and not isinstance(topic, str):
        raise ValueError("❌ Topic must be a string.")

    if not topic or not topic.strip():
        raise ValueError("❌ Topic cannot be empty.")

    topic = topic.strip()

    if len(topic) > max_length:
        raise ValueError("❌ Topic cannot exceed 50 characters.")

    print(f"✅ Input validated: '{topic}'")
    return topic
|
| 345 |
+
|
| 346 |
+
# Import-time banner confirming that the utility functions above are defined.
print("✓ All utility functions ready with metrics, URL caching, and retry logic")
|