3v324v23 committed on
Commit
eff2be4
·
0 Parent(s):
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Large files, to be downloaded via huggingface.
g3/index/G3.index
g3/checkpoints/mercator_finetune_weight.pth
g3/data/mp16/MP16_Pro_filtered.csv
index
checkpoints
data

# venv and dev stuff
linuxenv
myenv
.venv
.env
# Secrets: API credentials must never be committed.
acmmm2025-grand-challenge-gg-credentials.json
cred.json
**/__pycache__/
pyproject.toml
uv.lock
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base; bullseye variant provides the apt packages installed below.
FROM python:3.12-slim-bullseye
WORKDIR /code

# ffmpeg for media processing; xvfb provides the virtual display the
# entrypoint starts for Playwright's browser.
RUN apt-get update && apt-get install -y ffmpeg xvfb

# Install Python dependencies first so this layer is cached across source changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
RUN playwright install chrome
RUN playwright install-deps
# NOTE(review): this installs all default Playwright browsers in addition to
# Chrome above — likely redundant; confirm which browsers are actually used.
RUN playwright install

COPY ./src /code/src
# One-time setup step baked into the image build.
RUN python /code/src/setup.py

COPY ./app.py /code/app.py
COPY ./entrypoint.sh /code/entrypoint.sh

RUN chmod +x /code/entrypoint.sh
ENTRYPOINT [ "/code/entrypoint.sh" ]
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # G3 Geolocation Service
2
+
3
+ This is a containerized geolocation service based on the paper "G3: An Effective and Adaptive Framework for Worldwide Geolocalization Using Large Multi-Modality Models". The service is augmented with multilayer verification for location and evidence.
4
+
5
+ ## Prerequisites
6
+
7
+ - Docker with GPU support
8
+ - NVIDIA Container Toolkit (for GPU access)
9
+ - Required API keys (see Environment Variables section)
10
+
11
+ ## Quick Start
12
+
13
+ ### 1. Prepare Environment File
14
+
15
+ Create a `.env` file with the following variables:
16
+
17
+ ```bash
18
+ GOOGLE_CLOUD_API_KEY=your_google_cloud_api_key
19
+ GOOGLE_CSE_CX=your_google_custom_search_engine_id
20
+ SCRAPINGDOG_API_KEY=your_scrapingdog_api_key
21
+ IMGBB_API_KEY=your_imgbb_api_key
22
+ GOOGLE_APPLICATION_CREDENTIALS=/code/path/to/your/credentials.json
23
+ ```
24
+
25
+ ### 2. Prepare Google Cloud Credentials
26
+
27
+ Ensure you have a Google Cloud service account JSON credentials file ready for copying to the container.
28
+
29
+ ### 3. Build Docker Image
30
+
31
+ ```bash
32
+ docker build -t g3-geolocation .
33
+ ```
34
+
35
+ ### 4. Create Docker Container
36
+
37
+ ```bash
38
+ docker create --name g3-container -p 80:80 --gpus=all --env-file .env g3-geolocation
39
+ ```
40
+
41
+ ### 5. Copy Credentials to Container
42
+
43
+ ```bash
44
+ docker cp /path/to/your/credentials.json g3-container:/code/
45
+ ```
46
+
47
+ ### 6. Start Container
48
+
49
+ ```bash
50
+ docker start g3-container
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ Once the container is running, the service will be available at `http://localhost:80`.
56
+
57
+ ### API Endpoints
58
+
59
+ - **POST** `/g3/predict` - Submit images/videos for geolocation prediction
60
+ - **GET** `/g3/openapi` - Get OpenAPI specification
61
+
62
+ ### Example Request
63
+
64
+ ```bash
65
+ curl -X POST "http://localhost:80/g3/predict" \
66
+ -H "Content-Type: multipart/form-data" \
67
+ -F "files=@your_image.jpg"
68
+ ```
69
+
70
+ ## Environment Variables
71
+
72
+ | Variable | Description | Required |
73
+ | -------------------------------- | ------------------------------------------ | -------- |
74
+ | `GOOGLE_CLOUD_API_KEY` | Google Cloud API key for Gemini and Custom Google Search API | Yes |
75
+ | `GOOGLE_CSE_CX` | Google Custom Search Engine ID | Yes |
76
+ | `SCRAPINGDOG_API_KEY` | ScrapingDog API key for web scraping | Yes |
77
+ | `IMGBB_API_KEY` | ImgBB API key for image hosting | Yes |
78
+ | `GOOGLE_APPLICATION_CREDENTIALS` | Path to Google Cloud credentials JSON file | Yes |
79
+
80
+ ## API Keys Setup
81
+
82
+ ### Google Cloud API Key
83
+
84
+ 1. Go to [Google Cloud Console](https://console.cloud.google.com/)
85
+ 2. Enable Gemini API and Vision API
86
+ 3. Create an API key in the Credentials section
87
+
88
+ ### Google Custom Search Engine
89
+
90
+ 1. Go to [Google Custom Search](https://cse.google.com/)
91
+ 2. Create a new search engine
92
+ 3. Copy the Search Engine ID (CX)
93
+
94
+ ### ScrapingDog API Key
95
+
96
+ 1. Sign up at [ScrapingDog](https://scrapingdog.com/)
97
+ 2. Get your API key from the dashboard
98
+
99
+ ### ImgBB API Key
100
+
101
+ 1. Sign up at [ImgBB](https://imgbb.com/)
102
+ 2. Get your API key from the API section
103
+
104
+ ## Container Management
105
+
106
+ ### View Logs
107
+
108
+ ```bash
109
+ docker logs g3-container
110
+ ```
111
+
112
+ ### Stop Container
113
+
114
+ ```bash
115
+ docker stop g3-container
116
+ ```
117
+
118
+ ### Remove Container
119
+
120
+ ```bash
121
+ docker rm g3-container
122
+ ```
123
+
124
+ ### Remove Image
125
+
126
+ ```bash
127
+ docker rmi g3-geolocation
128
+ ```
129
+
130
+ ## Troubleshooting
131
+
132
+ ### GPU Access Issues
133
+
134
+ Ensure NVIDIA Container Toolkit is properly installed:
135
+
136
+ ```bash
137
+ nvidia-smi
138
+ docker run --rm --gpus all nvidia/cuda:11.0-base-ubuntu20.04 nvidia-smi
139
+ ```
140
+
141
+ ### API Key Issues
142
+
143
+ - Verify all API keys are valid and have proper permissions
144
+ - Check that the credentials file is properly copied to the container
145
+ - Ensure the `GOOGLE_APPLICATION_CREDENTIALS` path matches the copied file location
146
+
147
+ ### Memory Issues
148
+
149
+ If you encounter out-of-memory errors, consider:
150
+
151
+ - Reducing image sizes before upload
152
+ - Using a machine with more RAM/VRAM
153
+ - Adjusting batch processing parameters
154
+
155
+ ## Citation
156
+
157
+ ```bib
158
+ @article{jia2024g3,
159
+ title={G3: an effective and adaptive framework for worldwide geolocalization using large multi-modality models},
160
+ author={Jia, Pengyue and Liu, Yiding and Li, Xiaopeng and Zhao, Xiangyu and Wang, Yuhao and Du, Yantong and Han, Xiao and Wei, Xuetao and Wang, Shuaiqiang and Yin, Dawei},
161
+ journal={Advances in Neural Information Processing Systems},
162
+ volume={37},
163
+ pages={53198--53221},
164
+ year={2024}
165
+ }
166
+ ```
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import shutil
4
+ import uuid
5
+ from contextlib import asynccontextmanager
6
+ from typing import Annotated, Optional
7
+
8
+ import torch
9
+ from dotenv import load_dotenv
10
+ from fastapi import FastAPI, File, HTTPException, UploadFile, status
11
+ from pydantic import BaseModel, Field
12
+
13
+ from src.g3_batch_prediction import G3BatchPredictor
14
+
15
+ from src.utils import load_images_as_base64
16
+
17
+
18
# Response model: one supporting analysis plus optional references (links or
# base64-encoded images). No class docstring on purpose: pydantic would surface
# it as the schema description and change the generated OpenAPI output.
class EvidenceResponse(BaseModel):
    analysis: Annotated[
        str,
        Field(description="A supporting analysis for the prediction."),
    ]
    # Mutable default is safe here: pydantic copies field defaults per instance.
    references: Annotated[
        list[str],
        Field(description="Links or base64-encoded JPEG supporting the analysis."),
    ] = []
27
+
28
+
29
# Response model: the predicted coordinates, a textual location description,
# and the list of supporting evidence items. Comments (not a docstring) so the
# generated OpenAPI schema is unchanged.
class LocationPredictionResponse(BaseModel):
    latitude: Annotated[
        float,
        Field(description="Latitude of the predicted location, in degree."),
    ]
    longitude: Annotated[
        float,
        Field(description="Longitude of the predicted location, in degree."),
    ]
    location: Annotated[
        str,
        Field(description="Textual description of the predicted location."),
    ]
    evidence: Annotated[
        list[EvidenceResponse],
        Field(description="List of supporting analyses for the prediction."),
    ]
46
+
47
+
48
# Top-level response returned by POST /g3/predict: the prediction plus optional
# transcript and processed media.
class PredictionResponse(BaseModel):
    prediction: Annotated[
        LocationPredictionResponse,
        Field(description="The location prediction and accompanying analysis."),
    ]
    transcript: Annotated[
        str | None,
        Field(description="The extracted and concatenated transcripts, if any."),
    ] = None
    # Declared in the same Annotated[...] style as the sibling fields for
    # consistency; the generated JSON schema is identical to the previous
    # `Optional[list[str]] = Field(default=None, ...)` form.
    media: Annotated[
        list[str] | None,
        Field(description="List of media files processed during prediction."),
    ] = None
61
+
62
+
63
# Module-level predictor shared by all requests; initialized in `lifespan`.
predictor: G3BatchPredictor
64
+
65
+
66
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: runs once at startup (before ``yield``) and once
    at shutdown (after).

    Startup loads environment config, dumps the OpenAPI spec to disk, and
    builds the heavyweight predictor shared by all requests.
    """
    load_dotenv()

    # Persist the OpenAPI schema so it can be inspected outside the app.
    # Explicit UTF-8 avoids depending on the container's locale default.
    with open("openapi.json", "wt", encoding="utf-8") as api_file:
        json.dump(app.openapi(), api_file, indent=4)

    # Heavyweight model/index load; exposed through the module-level global.
    global predictor
    predictor = G3BatchPredictor(device="cuda" if torch.cuda.is_available() else "cpu")

    yield

    # Release the predictor's resources on shutdown.
    del predictor
79
+
80
+
81
# FastAPI application; `lifespan` handles predictor construction/teardown.
app = FastAPI(
    lifespan=lifespan,
    title="G3",
    description="An endpoint to predict GPS coordinate from static image,"
    " using G3 Framework.",
)
87
+
88
+
89
@app.post(
    "/g3/predict",
    description="Provide location prediction.",
)
async def predict_endpoint(
    files: Annotated[
        list[UploadFile],
        File(description="Input images, videos and metadata json."),
    ],
) -> PredictionResponse:
    """Persist the uploaded files, run the G3 predictor, and return the result.

    Raises:
        HTTPException: 500 when an uploaded file cannot be written to disk.
    """
    # Write files to disk. The predictor's directories are reset first so
    # inputs from a previous request cannot leak into this prediction.
    try:
        predictor.clear_directories()
        # Create the input directory once, not once per uploaded file.
        os.makedirs(predictor.input_dir, exist_ok=True)
        for file in files:
            # Uploads may arrive without a filename; fall back to a random one.
            filename = file.filename if file.filename is not None else uuid.uuid4().hex
            filepath = predictor.input_dir / filename
            with open(filepath, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        # Chain the original error so the root cause survives in tracebacks.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to save file: {e}",
        ) from e

    # Run the prediction with the configured multimodal model.
    response = await predictor.predict(model_name="gemini-2.5-pro")
    prediction = LocationPredictionResponse(
        latitude=response.latitude,
        longitude=response.longitude,
        location=response.location,
        evidence=[
            EvidenceResponse(analysis=ev.analysis, references=ev.references)
            for ev in response.evidence
        ],
    )
    # Transcript extracted during prediction, if any (None otherwise).
    transcript = predictor.get_transcript()

    # Processed media, encoded as base64 strings for the response payload.
    images_b64 = load_images_as_base64()

    return PredictionResponse(prediction=prediction, transcript=transcript, media=images_b64)
134
+
135
+
136
@app.get(
    "/g3/openapi",
    description="Provide the OpenAPI JSON describing this service's endpoints.",
)
async def openapi():
    # Delegate to FastAPI's (cached) schema generator.
    schema = app.openapi()
    return schema
docker-compose.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
services:
  web:
    build: .
    ports:
      # Host port 8000 -> container port 80 (where the API listens).
      - "8000:80"
    environment:
      # Must match the path the credentials file is mounted at below.
      - GOOGLE_APPLICATION_CREDENTIALS=/code/keys/credentials.json
    volumes:
      - ./.env:/code/.env

      - ./keys:/code/keys

      - ./entrypoint.sh:/code/entrypoint.sh

    env_file:
      - ./.env

    restart: unless-stopped
    # GPU access; requires the NVIDIA Container Toolkit on the host.
    runtime: nvidia
entrypoint.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# --- cleanup any stale Xvfb lock/socket ---
for stale in /tmp/.X99-lock /tmp/.X11-unix/X99; do
    if [ -e "$stale" ]; then
        echo "[entrypoint] removing stale $stale" >&2
        rm -f "$stale"
    fi
done

# --- start the virtual display ---
echo "[entrypoint] starting Xvfb on :99" >&2
Xvfb :99 -screen 0 1920x1080x24 &

# --- point GUI apps at it ---
export DISPLAY=:99
echo "[entrypoint] DISPLAY set to $DISPLAY" >&2

# --- launch FastAPI ---
echo "[entrypoint] exec fastapi" >&2
exec fastapi run app.py --port 80
openapi.json ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "openapi": "3.1.0",
3
+ "info": {
4
+ "title": "G3",
5
+ "description": "An endpoint to predict GPS coordinate from static image, using G3 Framework.",
6
+ "version": "0.1.0"
7
+ },
8
+ "paths": {
9
+ "/g3/predict": {
10
+ "post": {
11
+ "summary": "Predict Endpoint",
12
+ "description": "Provide location prediction.",
13
+ "operationId": "predict_endpoint_g3_predict_post",
14
+ "requestBody": {
15
+ "content": {
16
+ "multipart/form-data": {
17
+ "schema": {
18
+ "$ref": "#/components/schemas/Body_predict_endpoint_g3_predict_post"
19
+ }
20
+ }
21
+ },
22
+ "required": true
23
+ },
24
+ "responses": {
25
+ "200": {
26
+ "description": "Successful Response",
27
+ "content": {
28
+ "application/json": {
29
+ "schema": {
30
+ "$ref": "#/components/schemas/PredictionResponse"
31
+ }
32
+ }
33
+ }
34
+ },
35
+ "422": {
36
+ "description": "Validation Error",
37
+ "content": {
38
+ "application/json": {
39
+ "schema": {
40
+ "$ref": "#/components/schemas/HTTPValidationError"
41
+ }
42
+ }
43
+ }
44
+ }
45
+ }
46
+ }
47
+ },
48
+ "/g3/openapi": {
49
+ "get": {
50
+ "summary": "Openapi",
51
+ "description": "Provide the OpenAPI JSON describing this service's endpoints.",
52
+ "operationId": "openapi_g3_openapi_get",
53
+ "responses": {
54
+ "200": {
55
+ "description": "Successful Response",
56
+ "content": {
57
+ "application/json": {
58
+ "schema": {}
59
+ }
60
+ }
61
+ }
62
+ }
63
+ }
64
+ }
65
+ },
66
+ "components": {
67
+ "schemas": {
68
+ "Body_predict_endpoint_g3_predict_post": {
69
+ "properties": {
70
+ "files": {
71
+ "items": {
72
+ "type": "string",
73
+ "format": "binary"
74
+ },
75
+ "type": "array",
76
+ "title": "Files",
77
+ "description": "Input images, videos and metadata json."
78
+ }
79
+ },
80
+ "type": "object",
81
+ "required": [
82
+ "files"
83
+ ],
84
+ "title": "Body_predict_endpoint_g3_predict_post"
85
+ },
86
+ "EvidenceResponse": {
87
+ "properties": {
88
+ "analysis": {
89
+ "type": "string",
90
+ "title": "Analysis",
91
+ "description": "A supporting analysis for the prediction."
92
+ },
93
+ "references": {
94
+ "items": {
95
+ "type": "string"
96
+ },
97
+ "type": "array",
98
+ "title": "References",
99
+ "description": "Links or base64-encoded JPEG supporting the analysis.",
100
+ "default": []
101
+ }
102
+ },
103
+ "type": "object",
104
+ "required": [
105
+ "analysis"
106
+ ],
107
+ "title": "EvidenceResponse"
108
+ },
109
+ "HTTPValidationError": {
110
+ "properties": {
111
+ "detail": {
112
+ "items": {
113
+ "$ref": "#/components/schemas/ValidationError"
114
+ },
115
+ "type": "array",
116
+ "title": "Detail"
117
+ }
118
+ },
119
+ "type": "object",
120
+ "title": "HTTPValidationError"
121
+ },
122
+ "LocationPredictionResponse": {
123
+ "properties": {
124
+ "latitude": {
125
+ "type": "number",
126
+ "title": "Latitude",
127
+ "description": "Latitude of the predicted location, in degree."
128
+ },
129
+ "longitude": {
130
+ "type": "number",
131
+ "title": "Longitude",
132
+ "description": "Longitude of the predicted location, in degree."
133
+ },
134
+ "location": {
135
+ "type": "string",
136
+ "title": "Location",
137
+ "description": "Textual description of the predicted location."
138
+ },
139
+ "evidence": {
140
+ "items": {
141
+ "$ref": "#/components/schemas/EvidenceResponse"
142
+ },
143
+ "type": "array",
144
+ "title": "Evidence",
145
+ "description": "List of supporting analyses for the prediction."
146
+ }
147
+ },
148
+ "type": "object",
149
+ "required": [
150
+ "latitude",
151
+ "longitude",
152
+ "location",
153
+ "evidence"
154
+ ],
155
+ "title": "LocationPredictionResponse"
156
+ },
157
+ "PredictionResponse": {
158
+ "properties": {
159
+ "prediction": {
160
+ "$ref": "#/components/schemas/LocationPredictionResponse",
161
+ "description": "The location prediction and accompanying analysis."
162
+ },
163
+ "transcript": {
164
+ "anyOf": [
165
+ {
166
+ "type": "string"
167
+ },
168
+ {
169
+ "type": "null"
170
+ }
171
+ ],
172
+ "title": "Transcript",
173
+ "description": "The extracted and concatenated transcripts, if any."
174
+ },
175
+ "media": {
176
+ "anyOf": [
177
+ {
178
+ "items": {
179
+ "type": "string"
180
+ },
181
+ "type": "array"
182
+ },
183
+ {
184
+ "type": "null"
185
+ }
186
+ ],
187
+ "title": "Media",
188
+ "description": "List of media files processed during prediction."
189
+ }
190
+ },
191
+ "type": "object",
192
+ "required": [
193
+ "prediction"
194
+ ],
195
+ "title": "PredictionResponse"
196
+ },
197
+ "ValidationError": {
198
+ "properties": {
199
+ "loc": {
200
+ "items": {
201
+ "anyOf": [
202
+ {
203
+ "type": "string"
204
+ },
205
+ {
206
+ "type": "integer"
207
+ }
208
+ ]
209
+ },
210
+ "type": "array",
211
+ "title": "Location"
212
+ },
213
+ "msg": {
214
+ "type": "string",
215
+ "title": "Message"
216
+ },
217
+ "type": {
218
+ "type": "string",
219
+ "title": "Error Type"
220
+ }
221
+ },
222
+ "type": "object",
223
+ "required": [
224
+ "loc",
225
+ "msg",
226
+ "type"
227
+ ],
228
+ "title": "ValidationError"
229
+ }
230
+ }
231
+ }
232
+ }
requirements.txt ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv export --no-hashes --format requirements-txt
3
+ annotated-types==0.7.0
4
+ # via pydantic
5
+ anyio==4.9.0
6
+ # via
7
+ # google-genai
8
+ # httpx
9
+ # starlette
10
+ # watchfiles
11
+ cachetools==5.5.2
12
+ # via google-auth
13
+ certifi==2025.7.14
14
+ # via
15
+ # httpcore
16
+ # httpx
17
+ # pyproj
18
+ # requests
19
+ # sentry-sdk
20
+ charset-normalizer==3.4.2
21
+ # via requests
22
+ click==8.2.1
23
+ # via
24
+ # rich-toolkit
25
+ # typer
26
+ # uvicorn
27
+ colorama==0.4.6 ; sys_platform == 'win32'
28
+ # via
29
+ # click
30
+ # tqdm
31
+ # uvicorn
32
+ decorator==5.2.1
33
+ # via moviepy
34
+ dnspython==2.7.0
35
+ # via email-validator
36
+ einops==0.8.1
37
+ # via acmmm25-grand-challenge-geolocation
38
+ email-validator==2.2.0
39
+ # via
40
+ # fastapi
41
+ # pydantic
42
+ faiss-gpu-cu12==1.11.0
43
+ # via acmmm25-grand-challenge-geolocation
44
+ fastapi==0.116.1
45
+ # via acmmm25-grand-challenge-geolocation
46
+ fastapi-cli==0.0.8
47
+ # via fastapi
48
+ fastapi-cloud-cli==0.1.4
49
+ # via fastapi-cli
50
+ ffmpy==0.6.0
51
+ # via katna
52
+ filelock==3.18.0
53
+ # via
54
+ # huggingface-hub
55
+ # torch
56
+ # transformers
57
+ fsspec==2025.7.0
58
+ # via
59
+ # huggingface-hub
60
+ # torch
61
+ ftfy==6.3.1
62
+ # via open-clip-torch
63
+ geographiclib==2.0
64
+ # via geopy
65
+ geopy==2.4.1
66
+ # via acmmm25-grand-challenge-geolocation
67
+ google-api-core==2.25.1
68
+ # via
69
+ # google-cloud-videointelligence
70
+ # google-cloud-vision
71
+ google-auth==2.40.3
72
+ # via
73
+ # google-api-core
74
+ # google-cloud-videointelligence
75
+ # google-cloud-vision
76
+ # google-genai
77
+ google-cloud-videointelligence==2.16.2
78
+ # via acmmm25-grand-challenge-geolocation
79
+ google-cloud-vision==3.10.2
80
+ # via acmmm25-grand-challenge-geolocation
81
+ google-genai==1.26.0
82
+ # via acmmm25-grand-challenge-geolocation
83
+ googleapis-common-protos==1.70.0
84
+ # via
85
+ # google-api-core
86
+ # grpcio-status
87
+ greenlet==3.2.3
88
+ # via playwright
89
+ grpcio==1.73.1
90
+ # via
91
+ # google-api-core
92
+ # grpcio-status
93
+ grpcio-status==1.73.1
94
+ # via google-api-core
95
+ h11==0.16.0
96
+ # via
97
+ # httpcore
98
+ # uvicorn
99
+ hf-xet==1.1.5 ; platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
100
+ # via huggingface-hub
101
+ httpcore==1.0.9
102
+ # via httpx
103
+ httptools==0.6.4
104
+ # via uvicorn
105
+ httpx==0.28.1
106
+ # via
107
+ # fastapi
108
+ # fastapi-cloud-cli
109
+ # google-genai
110
+ huggingface-hub==0.33.4
111
+ # via
112
+ # open-clip-torch
113
+ # timm
114
+ # tokenizers
115
+ # transformers
116
+ idna==3.10
117
+ # via
118
+ # anyio
119
+ # email-validator
120
+ # httpx
121
+ # requests
122
+ imageio==2.37.0
123
+ # via
124
+ # moviepy
125
+ # scikit-image
126
+ imageio-ffmpeg==0.6.0
127
+ # via
128
+ # katna
129
+ # moviepy
130
+ imutils==0.5.4
131
+ # via katna
132
+ jinja2==3.1.6
133
+ # via
134
+ # fastapi
135
+ # torch
136
+ joblib==1.5.1
137
+ # via scikit-learn
138
+ katna==0.9.2
139
+ # via acmmm25-grand-challenge-geolocation
140
+ lazy-loader==0.4
141
+ # via scikit-image
142
+ llvmlite==0.44.0
143
+ # via numba
144
+ markdown-it-py==3.0.0
145
+ # via rich
146
+ markupsafe==3.0.2
147
+ # via jinja2
148
+ mdurl==0.1.2
149
+ # via markdown-it-py
150
+ more-itertools==10.7.0
151
+ # via openai-whisper
152
+ moviepy==2.2.1
153
+ # via acmmm25-grand-challenge-geolocation
154
+ mpmath==1.3.0
155
+ # via sympy
156
+ networkx==3.5
157
+ # via
158
+ # scikit-image
159
+ # torch
160
+ numba==0.61.2
161
+ # via openai-whisper
162
+ numpy==1.26.4
163
+ # via
164
+ # faiss-gpu-cu12
165
+ # imageio
166
+ # katna
167
+ # moviepy
168
+ # numba
169
+ # openai-whisper
170
+ # opencv-contrib-python
171
+ # opencv-python
172
+ # pandas
173
+ # scikit-image
174
+ # scikit-learn
175
+ # scipy
176
+ # tifffile
177
+ # torchvision
178
+ # transformers
179
+ nvidia-cublas-cu12==12.6.4.1
180
+ # via
181
+ # faiss-gpu-cu12
182
+ # nvidia-cudnn-cu12
183
+ # nvidia-cusolver-cu12
184
+ # torch
185
+ nvidia-cuda-cupti-cu12==12.6.80 ; platform_machine == 'x86_64' and sys_platform == 'linux'
186
+ # via torch
187
+ nvidia-cuda-nvrtc-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
188
+ # via torch
189
+ nvidia-cuda-runtime-cu12==12.6.77
190
+ # via
191
+ # faiss-gpu-cu12
192
+ # torch
193
+ nvidia-cudnn-cu12==9.5.1.17 ; platform_machine == 'x86_64' and sys_platform == 'linux'
194
+ # via torch
195
+ nvidia-cufft-cu12==11.3.0.4 ; platform_machine == 'x86_64' and sys_platform == 'linux'
196
+ # via torch
197
+ nvidia-cufile-cu12==1.11.1.6 ; platform_machine == 'x86_64' and sys_platform == 'linux'
198
+ # via torch
199
+ nvidia-curand-cu12==10.3.7.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
200
+ # via torch
201
+ nvidia-cusolver-cu12==11.7.1.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
202
+ # via torch
203
+ nvidia-cusparse-cu12==12.5.4.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
204
+ # via
205
+ # nvidia-cusolver-cu12
206
+ # torch
207
+ nvidia-cusparselt-cu12==0.6.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
208
+ # via torch
209
+ nvidia-nccl-cu12==2.26.2 ; platform_machine == 'x86_64' and sys_platform == 'linux'
210
+ # via torch
211
+ nvidia-nvjitlink-cu12==12.6.85 ; platform_machine == 'x86_64' and sys_platform == 'linux'
212
+ # via
213
+ # nvidia-cufft-cu12
214
+ # nvidia-cusolver-cu12
215
+ # nvidia-cusparse-cu12
216
+ # torch
217
+ nvidia-nvtx-cu12==12.6.77 ; platform_machine == 'x86_64' and sys_platform == 'linux'
218
+ # via torch
219
+ open-clip-torch==2.32.0
220
+ # via acmmm25-grand-challenge-geolocation
221
+ openai-whisper==20250625
222
+ # via acmmm25-grand-challenge-geolocation
223
+ opencv-contrib-python==4.11.0.86
224
+ # via katna
225
+ opencv-python==4.11.0.86
226
+ # via acmmm25-grand-challenge-geolocation
227
+ packaging==25.0
228
+ # via
229
+ # faiss-gpu-cu12
230
+ # huggingface-hub
231
+ # lazy-loader
232
+ # scikit-image
233
+ # transformers
234
+ pandas==2.3.1
235
+ # via acmmm25-grand-challenge-geolocation
236
+ pillow==11.3.0
237
+ # via
238
+ # acmmm25-grand-challenge-geolocation
239
+ # imageio
240
+ # moviepy
241
+ # scikit-image
242
+ # torchvision
243
+ playwright==1.53.0
244
+ # via acmmm25-grand-challenge-geolocation
245
+ proglog==0.1.12
246
+ # via moviepy
247
+ proto-plus==1.26.1
248
+ # via
249
+ # google-api-core
250
+ # google-cloud-videointelligence
251
+ # google-cloud-vision
252
+ protobuf==6.31.1
253
+ # via
254
+ # google-api-core
255
+ # google-cloud-videointelligence
256
+ # google-cloud-vision
257
+ # googleapis-common-protos
258
+ # grpcio-status
259
+ # proto-plus
260
+ psutil==7.0.0
261
+ # via katna
262
+ pyasn1==0.6.1
263
+ # via
264
+ # pyasn1-modules
265
+ # rsa
266
+ pyasn1-modules==0.4.2
267
+ # via google-auth
268
+ pydantic==2.11.7
269
+ # via
270
+ # fastapi
271
+ # fastapi-cloud-cli
272
+ # google-genai
273
+ pydantic-core==2.33.2
274
+ # via pydantic
275
+ pyee==13.0.0
276
+ # via playwright
277
+ pygments==2.19.2
278
+ # via rich
279
+ pyproj==3.7.1
280
+ # via acmmm25-grand-challenge-geolocation
281
+ python-dateutil==2.9.0.post0
282
+ # via pandas
283
+ python-dotenv==1.1.1
284
+ # via
285
+ # acmmm25-grand-challenge-geolocation
286
+ # moviepy
287
+ # uvicorn
288
+ python-multipart==0.0.20
289
+ # via fastapi
290
+ pytz==2025.2
291
+ # via pandas
292
+ pyyaml==6.0.2
293
+ # via
294
+ # acmmm25-grand-challenge-geolocation
295
+ # huggingface-hub
296
+ # timm
297
+ # transformers
298
+ # uvicorn
299
+ regex==2024.11.6
300
+ # via
301
+ # open-clip-torch
302
+ # tiktoken
303
+ # transformers
304
+ requests==2.32.4
305
+ # via
306
+ # google-api-core
307
+ # google-genai
308
+ # huggingface-hub
309
+ # katna
310
+ # tiktoken
311
+ # transformers
312
+ rich==14.0.0
313
+ # via
314
+ # rich-toolkit
315
+ # typer
316
+ rich-toolkit==0.14.8
317
+ # via
318
+ # fastapi-cli
319
+ # fastapi-cloud-cli
320
+ rignore==0.6.2
321
+ # via fastapi-cloud-cli
322
+ rsa==4.9.1
323
+ # via google-auth
324
+ safetensors==0.5.3
325
+ # via
326
+ # open-clip-torch
327
+ # timm
328
+ # transformers
329
+ scikit-image==0.25.2
330
+ # via katna
331
+ scikit-learn==1.7.0
332
+ # via
333
+ # acmmm25-grand-challenge-geolocation
334
+ # katna
335
+ scipy==1.16.0
336
+ # via
337
+ # katna
338
+ # scikit-image
339
+ # scikit-learn
340
+ sentry-sdk==2.33.0
341
+ # via fastapi-cloud-cli
342
+ setuptools==80.9.0
343
+ # via
344
+ # torch
345
+ # triton
346
+ shellingham==1.5.4
347
+ # via typer
348
+ six==1.17.0
349
+ # via python-dateutil
350
+ sniffio==1.3.1
351
+ # via anyio
352
+ starlette==0.47.1
353
+ # via fastapi
354
+ sympy==1.14.0
355
+ # via torch
356
+ tenacity==8.5.0
357
+ # via google-genai
358
+ threadpoolctl==3.6.0
359
+ # via scikit-learn
360
+ tifffile==2025.6.11
361
+ # via scikit-image
362
+ tiktoken==0.9.0
363
+ # via openai-whisper
364
+ timm==1.0.17
365
+ # via open-clip-torch
366
+ tokenizers==0.21.2
367
+ # via transformers
368
+ torch==2.7.1
369
+ # via
370
+ # acmmm25-grand-challenge-geolocation
371
+ # open-clip-torch
372
+ # openai-whisper
373
+ # timm
374
+ # torchvision
375
+ torchvision==0.22.1
376
+ # via
377
+ # acmmm25-grand-challenge-geolocation
378
+ # open-clip-torch
379
+ # timm
380
+ tqdm==4.67.1
381
+ # via
382
+ # acmmm25-grand-challenge-geolocation
383
+ # huggingface-hub
384
+ # open-clip-torch
385
+ # openai-whisper
386
+ # proglog
387
+ # transformers
388
+ transformers==4.53.2
389
+ # via acmmm25-grand-challenge-geolocation
390
+ triton==3.3.1 ; (platform_machine == 'x86_64' and sys_platform == 'linux') or sys_platform == 'linux2'
391
+ # via
392
+ # openai-whisper
393
+ # torch
394
+ typer==0.16.0
395
+ # via
396
+ # fastapi-cli
397
+ # fastapi-cloud-cli
398
+ typing-extensions==4.14.1
399
+ # via
400
+ # anyio
401
+ # fastapi
402
+ # google-genai
403
+ # huggingface-hub
404
+ # pydantic
405
+ # pydantic-core
406
+ # pyee
407
+ # rich-toolkit
408
+ # starlette
409
+ # torch
410
+ # typer
411
+ # typing-inspection
412
+ typing-inspection==0.4.1
413
+ # via pydantic
414
+ tzdata==2025.2
415
+ # via pandas
416
+ urllib3==2.5.0
417
+ # via
418
+ # requests
419
+ # sentry-sdk
420
+ uvicorn==0.35.0
421
+ # via
422
+ # fastapi
423
+ # fastapi-cli
424
+ # fastapi-cloud-cli
425
+ uvloop==0.21.0 ; platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'
426
+ # via uvicorn
427
+ watchfiles==1.1.0
428
+ # via uvicorn
429
+ wcwidth==0.2.13
430
+ # via ftfy
431
+ websockets==15.0.1
432
+ # via
433
+ # google-genai
434
+ # uvicorn
src/data_processor.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import hashlib
6
+ import shutil
7
+ from pathlib import Path
8
+
9
+ import faiss
10
+ import torch
11
+ from PIL import Image
12
+ from torch import nn
13
+
14
+ from .prompt.fetch.content_fetch import fetch_links_to_json
15
+ from .prompt.fetch.satellite_fetch import fetch_satellite_image
16
+ from .prompt.preprocess.keyframe_extract import extract_and_save_keyframes
17
+ from .prompt.preprocess.video_transcribe import transcribe_video_directory
18
+ from .prompt.search.image_search import image_search_directory
19
+ from .prompt.search.index_search import save_results_to_json, search_index_directory
20
+ from .prompt.search.text_search import text_search_image, text_search_link
21
+
22
+ logger = logging.getLogger("uvicorn.error")
23
+
24
+
25
class DataProcessor:
    """Pipeline helper that turns raw input media into prompt-ready artifacts.

    Orchestrates keyframe extraction, video transcription, image/text web
    search, and FAISS index search over a directory of input images/videos.
    """

    def __init__(
        self,
        model: nn.Module,
        input_dir: Path,
        prompt_dir: Path,
        cache_dir: Path,
        image_dir: Path,
        audio_dir: Path,
        index_path: Path,
        database_csv_path: Path,
        device: torch.device,
    ):
        """Store pipeline paths and load the FAISS index.

        Args:
            model: Embedding model used for index search.
            input_dir: Directory containing the raw input images/videos.
            prompt_dir: Directory where prompt artifacts (JSON, transcripts) go.
            cache_dir: Directory used to cache processed input/prompt data.
            image_dir: Directory where extracted/collected images are stored.
            audio_dir: Directory where video transcripts are stored.
            index_path: Path to the serialized FAISS index file.
            database_csv_path: CSV with GPS metadata backing the index.
            device: Torch device used for inference.

        Raises:
            RuntimeError: If the FAISS index cannot be read from ``index_path``.
        """
        self.input_dir = input_dir
        self.prompt_dir = prompt_dir
        self.cache_dir = cache_dir
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.model = model
        self.device = device
        self.database_csv_path = database_csv_path

        try:
            self.index = faiss.read_index(str(index_path))
            logger.info(f"✅ Successfully loaded FAISS index from: {index_path}")
        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise RuntimeError(
                f"Failed to load FAISS index from {index_path}: {e}"
            ) from e

        # Lowercase extensions recognized as still images / videos.
        self.image_extension = {
            ".jpg",
            ".jpeg",
            ".png",
            ".bmp",
            ".tiff",
            ".tif",
            ".webp",
        }
        self.video_extension = {
            ".mp4",
            ".avi",
            ".mov",
            ".mkv",
        }
+ def __extract_keyframes(self):
70
+ """
71
+ Extract keyframes from all videos in the input directory.
72
+ Put all images and keyframes into the prompt directory.
73
+ """
74
+ output_dir = self.image_dir
75
+ os.makedirs(output_dir, exist_ok=True)
76
+
77
+ # Determine starting index based on existing files
78
+ current_files = list(output_dir.glob("image_*.*"))
79
+ idx = len(current_files)
80
+
81
+ # Process images
82
+ for file_name in os.listdir(self.input_dir):
83
+ file_path = os.path.join(self.input_dir, file_name)
84
+ if os.path.isfile(file_path) and file_name.lower().endswith(
85
+ tuple(self.image_extension)
86
+ ):
87
+ out_path = output_dir / f"image_{idx:03d}.jpg"
88
+ Image.open(file_path).convert("RGB").save(out_path)
89
+ idx += 1
90
+
91
+ # Process videos
92
+ for file_name in os.listdir(self.input_dir):
93
+ file_path = os.path.join(self.input_dir, file_name)
94
+ if os.path.isfile(file_path) and file_name.lower().endswith(
95
+ tuple(self.video_extension)
96
+ ):
97
+ if idx is None:
98
+ idx = 0
99
+ idx = extract_and_save_keyframes(
100
+ video_path=file_path, output_dir=str(output_dir), start_index=idx
101
+ )
102
+ logger.info(f"✅ Extracted keyframes and images to: {output_dir}")
103
+
104
+ def __transcribe_videos(self):
105
+ """
106
+ Transcribe all videos in the input directory.
107
+ Save transcripts into the prompt directory.
108
+ """
109
+ audio_dir = self.audio_dir
110
+ os.makedirs(audio_dir, exist_ok=True)
111
+
112
+ if audio_dir.is_dir() and any(audio_dir.iterdir()):
113
+ logger.info(f"🔄 Found existing transcripts in directory: {audio_dir}")
114
+ return
115
+
116
+ transcribe_video_directory(
117
+ video_dir=str(self.input_dir),
118
+ output_dir=str(audio_dir),
119
+ model_name="base", # Use the base Whisper model for transcription
120
+ )
121
+ logger.info(f"✅ Successfully transcribed videos to: {audio_dir}")
122
+
123
+ def __image_search(self):
124
+ """
125
+ Perform image search on all images in the input directory.
126
+ Save search results into the prompt directory.
127
+ """
128
+ image_dir = self.image_dir
129
+
130
+ if os.environ["IMGBB_API_KEY"] is None:
131
+ raise ValueError(
132
+ "IMGBB_API_KEY environment variable is not set or is None."
133
+ )
134
+ if os.environ["SCRAPINGDOG_API_KEY"] is None:
135
+ raise ValueError(
136
+ "SCRAPINGDOG_API_KEY environment variable is not set or is None."
137
+ )
138
+ image_search_directory(
139
+ directory=str(image_dir),
140
+ output_dir=str(self.prompt_dir),
141
+ filename="metadata.json",
142
+ imgbb_key=os.environ["IMGBB_API_KEY"],
143
+ scrapingdog_key=os.environ["SCRAPINGDOG_API_KEY"],
144
+ max_workers=4,
145
+ target_links=20,
146
+ )
147
+ logger.info(f"✅ Successfully performed image search on: {image_dir}")
148
+
149
+ def __text_search(self):
150
+ """
151
+ Perform text search with metadata to get related links.
152
+ """
153
+ query = ""
154
+ metadata_file = self.prompt_dir / "metadata.json"
155
+ if not metadata_file.exists():
156
+ query = ""
157
+ else:
158
+ with open(metadata_file, "r") as f:
159
+ metadata = json.load(f)
160
+ description = metadata.get("description", "")
161
+ location = metadata.get("location", "")
162
+ query = f"{description} in {location}".strip()
163
+
164
+ text_search_link(
165
+ query=query,
166
+ output_dir=str(self.prompt_dir),
167
+ filename="text_search.json",
168
+ num_results=10,
169
+ api_key=os.environ["GOOGLE_CLOUD_API_KEY"],
170
+ cx=os.environ["GOOGLE_CSE_CX"],
171
+ )
172
+
173
+ async def __fetch_related_link_content(
174
+ self, image_prediction: bool = True, text_prediction: bool = True
175
+ ):
176
+ """
177
+ Fetch related link content for all images and text in the prompt directory.
178
+ """
179
+
180
+ async def fetch_and_save_links(links, output_filename):
181
+ if links:
182
+ await fetch_links_to_json(
183
+ links=list(links),
184
+ output_path=str(self.prompt_dir / output_filename),
185
+ max_content_length=5000,
186
+ )
187
+ logger.info(
188
+ f"Fetched content for {len(links)} links into {output_filename}"
189
+ )
190
+
191
+ # Image links
192
+ image_links = set()
193
+ image_search_file = self.prompt_dir / "metadata.json"
194
+ if image_prediction:
195
+ if not image_search_file.exists():
196
+ self.__image_search()
197
+ with open(image_search_file, "r") as f:
198
+ image_search_data = json.load(f)
199
+ image_links.update(image_search_data.get("all_links", []))
200
+ logger.info(f"Found {len(image_links)} image links to fetch content from.")
201
+ await fetch_and_save_links(image_links, "image_search_content.json")
202
+
203
+ # Text links
204
+ text_links = set()
205
+ text_search_file = self.prompt_dir / "text_search.json"
206
+ if text_prediction:
207
+ if not text_search_file.exists():
208
+ self.__text_search()
209
+ with open(text_search_file, "r") as f:
210
+ text_search_data = json.load(f)
211
+ text_links.update(filter(None, text_search_data.get("links", [])))
212
+ logger.info(f"Found {len(text_links)} text links to fetch content from.")
213
+ await fetch_and_save_links(text_links, "text_search_content.json")
214
+
215
+ if not image_links and not text_links:
216
+ logger.info("No links found in image or text search results.")
217
+
218
+ def __index_search(self):
219
+ """
220
+ Perform FAISS index search on all images in the prompt directory.
221
+ Save search results into the report directory.
222
+ """
223
+ if not self.index:
224
+ raise RuntimeError(
225
+ "FAISS index is not loaded. Cannot perform index search."
226
+ )
227
+
228
+ output_path = self.prompt_dir / "index_search.json"
229
+ if output_path.exists():
230
+ logger.info(
231
+ f"Index search results already exist at {output_path}, skipping search."
232
+ )
233
+ return
234
+
235
+ if not os.path.exists(self.database_csv_path):
236
+ raise FileNotFoundError(
237
+ f"Database CSV file not found: {self.database_csv_path}"
238
+ )
239
+
240
+ candidates_gps, reverse_gps = search_index_directory(
241
+ model=self.model,
242
+ device=self.device,
243
+ index=self.index,
244
+ image_dir=str(self.image_dir),
245
+ database_csv_path=str(self.database_csv_path),
246
+ top_k=20,
247
+ max_elements=20,
248
+ )
249
+
250
+ save_results_to_json(candidates_gps, reverse_gps, str(output_path))
251
+ logger.info(
252
+ f"✅ Successfully performed index search. Results saved to: {output_path}"
253
+ )
254
+
255
+ async def __fetch_satellite_image_async(
256
+ self,
257
+ latitude: float,
258
+ longitude: float,
259
+ zoom: int,
260
+ output_path: Path,
261
+ ) -> None:
262
+ """
263
+ Asynchronously fetches a satellite image without blocking the event loop.
264
+
265
+ Runs the synchronous `fetch_satellite_image` function in a background thread.
266
+
267
+ Args:
268
+ latitude (float): Latitude of the location.
269
+ longitude (float): Longitude of the location.
270
+ zoom (int): Zoom level of the satellite image.
271
+ output_path (Path): Path to save the image file.
272
+ """
273
+ await asyncio.to_thread(
274
+ fetch_satellite_image,
275
+ latitude,
276
+ longitude,
277
+ zoom,
278
+ str(output_path),
279
+ )
280
+
281
+ async def __search_images_async(
282
+ self,
283
+ location: str,
284
+ num_images: int,
285
+ api_key: str | None,
286
+ cse_cx: str | None,
287
+ output_dir: Path,
288
+ image_id_offset: int,
289
+ ) -> list[str]:
290
+ """
291
+ Asynchronously searches for images based on a text location query.
292
+
293
+ Args:
294
+ location (str): Text location to search.
295
+ num_images (int): Number of images to fetch.
296
+ api_key (str): Google Cloud API key.
297
+ cse_cx (str): Google Custom Search Engine ID.
298
+ output_dir (Path): Directory where images will be saved.
299
+ image_id_offset (int): Offset for image filenames.
300
+
301
+ Returns:
302
+ Any: The result of `text_search_image`, if it returns a value.
303
+ """
304
+ return await asyncio.to_thread(
305
+ text_search_image,
306
+ location,
307
+ num_images,
308
+ api_key,
309
+ cse_cx,
310
+ str(output_dir),
311
+ image_id_offset,
312
+ )
313
+
314
+ def __compute_sha256(self, filepath: Path) -> str:
315
+ """
316
+ Compute the SHA-256 hash of a file.
317
+ """
318
+ if not filepath.is_file():
319
+ raise ValueError(f"File does not exist: {filepath}")
320
+
321
+ sha256 = hashlib.sha256()
322
+ with open(filepath, "rb") as f:
323
+ for chunk in iter(lambda: f.read(4096), b""):
324
+ sha256.update(chunk)
325
+ return sha256.hexdigest()
326
+
327
+ def __compare_directories(self, dir1: Path, dir2: Path) -> bool:
328
+ """
329
+ Compare two directories to check if they contain the same files with identical content.
330
+ Args:
331
+ dir1 (Path): First directory to compare.
332
+ dir2 (Path): Second directory to compare.
333
+ Returns:
334
+ bool: True if both directories contain the same files with identical content, False otherwise.
335
+ """
336
+ if not dir1.is_dir() or not dir2.is_dir():
337
+ return False
338
+
339
+ files1 = sorted(p for p in dir1.iterdir() if p.is_file())
340
+ files2 = sorted(p for p in dir2.iterdir() if p.is_file())
341
+
342
+ # Check if filenames match exactly
343
+ names1 = {p.name for p in files1}
344
+ names2 = {p.name for p in files2}
345
+ if names1 != names2:
346
+ return False
347
+
348
+ # Compare each matching file
349
+ for filename in names1:
350
+ path1 = dir1 / filename
351
+ path2 = dir2 / filename
352
+
353
+ # Skip directories
354
+ if not path1.is_file() or not path2.is_file():
355
+ continue
356
+
357
+ hash1 = self.__compute_sha256(path1)
358
+ hash2 = self.__compute_sha256(path2)
359
+
360
+ if hash1 != hash2:
361
+ return False # Found mismatch
362
+ return True # All matching files are identical
363
+
364
+ def __copy_directory(self, src: Path, dest: Path):
365
+ """
366
+ Recursively copy all files from src to dest.
367
+ """
368
+ if not src.is_dir():
369
+ raise ValueError(f"Source path is not a directory: {src}")
370
+
371
+ # Delete everything in dest first
372
+ if dest.exists():
373
+ for item in dest.iterdir():
374
+ if item.is_file() or item.is_symlink():
375
+ item.unlink()
376
+ elif item.is_dir():
377
+ shutil.rmtree(item)
378
+
379
+ # Ensure dest exists
380
+ dest.mkdir(parents=True, exist_ok=True)
381
+
382
+ for item in src.iterdir():
383
+ if item.is_dir():
384
+ self.__copy_directory(item, dest / item.name)
385
+ else:
386
+ dest_file = dest / item.name
387
+ if not dest_file.exists() or not self.__compare_directories(
388
+ item, dest_file
389
+ ):
390
+ shutil.copy2(item, dest_file)
391
+
392
+ async def preprocess_input_data(
393
+ self,
394
+ image_prediction: bool = True,
395
+ text_prediction: bool = True,
396
+ ):
397
+ """
398
+ Preprocess all input data:
399
+ - Extract keyframes from videos.
400
+ - Transcribe videos.
401
+ - Fetch related link content from images.
402
+ Save images and extracted keyframes into the output directory
403
+ """
404
+ os.makedirs(self.prompt_dir, exist_ok=True)
405
+ os.makedirs(self.cache_dir, exist_ok=True)
406
+
407
+ cache_dir_input = self.cache_dir / "input_data"
408
+ cache_dir_prompt = self.cache_dir / "prompt_data"
409
+ if self.__compare_directories(self.input_dir, cache_dir_input):
410
+ logger.info("Input data already processed, skipping...")
411
+ self.__copy_directory(cache_dir_prompt, self.prompt_dir)
412
+ return
413
+ else:
414
+ logger.info("Processing input data...")
415
+
416
+ metadata_dest = self.prompt_dir / "metadata.json"
417
+ if not metadata_dest.exists():
418
+ for file in os.listdir(self.input_dir):
419
+ if file.endswith(".json"):
420
+ file_path = os.path.join(self.input_dir, file)
421
+ with open(file_path, "r") as src_file:
422
+ with open(metadata_dest, "w") as dest_file:
423
+ dest_file.write(src_file.read())
424
+ break
425
+
426
+ self.__extract_keyframes()
427
+ self.__transcribe_videos()
428
+ await self.__fetch_related_link_content(
429
+ image_prediction=image_prediction, text_prediction=text_prediction
430
+ )
431
+ self.__index_search()
432
+
433
+ logger.info("✅ Preprocessing completed")
434
+ logger.info(f"Saving processed data to cache directory: {self.cache_dir}")
435
+ self.__copy_directory(self.input_dir, cache_dir_input)
436
+ self.__copy_directory(self.prompt_dir, cache_dir_prompt)
437
+
438
+ async def prepare_location_images(
439
+ self,
440
+ prediction: dict,
441
+ image_prediction: bool = True,
442
+ text_prediction: bool = True,
443
+ ) -> int:
444
+ """
445
+ Prepare verification data from the prediction with parallel fetching.
446
+
447
+ Args:
448
+ prediction (dict): Prediction dictionary with latitude, longitude, location, reason, and metadata
449
+ image_prediction (bool): Whether to include original images in verification
450
+ text_prediction (bool): Whether to include text-based verification
451
+
452
+ Returns:
453
+ int: Satellite image ID for reference in prompts
454
+ """
455
+ image_dir = self.image_dir
456
+ satellite_image_id = len(list(self.image_dir.glob("image_*.*")))
457
+
458
+ # Execute both operations in parallel
459
+ logger.info("🔄 Fetching satellite image and location images in parallel...")
460
+
461
+ # Ensure required API keys are present
462
+ if not os.environ.get("GOOGLE_CLOUD_API_KEY"):
463
+ raise ValueError(
464
+ "GOOGLE_CLOUD_API_KEY environment variable is not set or is None."
465
+ )
466
+ if not os.environ.get("GOOGLE_CSE_CX"):
467
+ raise ValueError(
468
+ "GOOGLE_CSE_CX environment variable is not set or is None."
469
+ )
470
+
471
+ await asyncio.gather(
472
+ self.__fetch_satellite_image_async(
473
+ prediction["latitude"],
474
+ prediction["longitude"],
475
+ zoom=200,
476
+ output_path=image_dir / f"image_{satellite_image_id:03d}.jpg",
477
+ ),
478
+ self.__search_images_async(
479
+ location=prediction["location"],
480
+ num_images=5,
481
+ api_key=os.environ["GOOGLE_CLOUD_API_KEY"],
482
+ cse_cx=os.environ["GOOGLE_CSE_CX"],
483
+ output_dir=image_dir,
484
+ image_id_offset=satellite_image_id + 1,
485
+ ),
486
+ )
487
+ logger.info("✅ Verification data preparation completed")
488
+ return satellite_image_id
src/g3/G3.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import cast
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer
5
+
6
+ from .locationencoder import LocationEncoder
7
+
8
class G3(torch.nn.Module):
    """Geolocation model aligning CLIP image/text embeddings with GPS embeddings."""

    def __init__(
        self,
        device: str,
        positional_encoding_type: str = "sh",
        neural_network_type: str = "siren",
        hparams: dict | None = None,
    ):
        """Build the frozen CLIP backbone plus trainable projection heads.

        Args:
            device: Device identifier forwarded to the location encoder.
            positional_encoding_type: Positional encoding name; only the
                part before the first underscore is passed on.
            neural_network_type: Network type for the location encoder.
            hparams: Optional hyperparameters for the location encoder.
        """
        super().__init__()
        self.device = device

        # Pretrained CLIP backbone: vision/text towers plus their projections.
        pretrained = "openai/clip-vit-large-patch14"
        clip_model = cast(CLIPModel, CLIPModel.from_pretrained(pretrained))
        self.vision_model = clip_model.vision_model
        self.text_model = clip_model.text_model
        self.vision_processor = cast(
            CLIPImageProcessor, CLIPImageProcessor.from_pretrained(pretrained)
        )
        self.text_processor = cast(
            CLIPTokenizer, CLIPTokenizer.from_pretrained(pretrained)
        )
        self.vision_projection = clip_model.visual_projection
        self.text_projection = clip_model.text_projection

        # Learnable log-temperature for each contrastive pairing.
        self.logit_scale1 = nn.Parameter(torch.tensor(3.99))
        self.logit_scale2 = nn.Parameter(torch.tensor(3.99))
        self.logit_scale3 = nn.Parameter(torch.tensor(3.99))

        self.location_encoder = LocationEncoder(
            positional_encoding_type=positional_encoding_type.split("_")[0],
            neural_network_type=neural_network_type,
            hparams=hparams,
            device=device,
        )  # output batch_size, 3, 512

        def _mlp(in_dim: int, out_dim: int) -> nn.Sequential:
            # Two-layer projection head with a ReLU in between.
            return nn.Sequential(
                nn.Linear(in_dim, in_dim), nn.ReLU(), nn.Linear(in_dim, out_dim)
            )

        # Trainable heads (registration order preserved for state_dict parity).
        self.vision_projection_else_1 = _mlp(768, 768)
        self.text_projection_else = _mlp(768, 768)
        self.vision_projection_else_2 = _mlp(768, 768)
        self.location_projection_else = _mlp(512, 768)

        # Freeze the CLIP backbone; only the new heads and encoder train.
        self.vision_model.requires_grad_(False)
        self.vision_projection.requires_grad_(False)
        self.text_model.requires_grad_(False)
        self.text_projection.requires_grad_(False)
+ def forward(self, images, texts, longitude, latitude):
64
+ vision_output = self.vision_model(images)[1]
65
+ text_output = self.text_model(**texts)[1]
66
+ image_embeds = self.vision_projection(vision_output)
67
+ text_embeds = self.text_projection(text_output) # batch_size, 512
68
+ this_batch_locations = torch.stack((latitude, longitude), dim=1)
69
+ location_embeds = self.location_encoder(this_batch_locations)
70
+
71
+ # phase _1
72
+ image_embeds_1 = self.vision_projection_else_1(image_embeds)
73
+ text_embeds_1 = self.text_projection_else(
74
+ text_embeds.reshape(text_embeds.shape[0], -1)
75
+ )
76
+
77
+ # normalized features
78
+ image_embeds_1 = image_embeds_1 / image_embeds_1.norm(p=2, dim=-1, keepdim=True)
79
+ text_embeds_1 = text_embeds_1 / text_embeds_1.norm(p=2, dim=-1, keepdim=True)
80
+
81
+ # image with texts
82
+ logit_scale = self.logit_scale1.exp()
83
+ logits_per_texts_with_images = (
84
+ torch.matmul(text_embeds_1, image_embeds_1.t()) * logit_scale
85
+ )
86
+ logits_per_images_with_texts = logits_per_texts_with_images.t()
87
+ loss_phase_1 = self.clip_loss(logits_per_texts_with_images)
88
+
89
+ # phase _2
90
+ image_embeds_2 = self.vision_projection_else_2(image_embeds)
91
+ location_embeds_2 = self.location_projection_else(
92
+ location_embeds.reshape(location_embeds.shape[0], -1)
93
+ )
94
+
95
+ # normalized features
96
+ image_embeds_2 = image_embeds_2 / image_embeds_2.norm(p=2, dim=-1, keepdim=True)
97
+ location_embeds_2 = location_embeds_2 / location_embeds_2.norm(
98
+ p=2, dim=-1, keepdim=True
99
+ )
100
+
101
+ # image with location
102
+ logit_scale = self.logit_scale2.exp()
103
+ logits_per_locations_with_images = (
104
+ torch.matmul(location_embeds_2, image_embeds_2.t()) * logit_scale
105
+ )
106
+ logits_per_images_with_locations = logits_per_locations_with_images.t()
107
+ loss_phase_2 = None
108
+ loss_phase_2 = self.clip_loss(logits_per_locations_with_images)
109
+
110
+ loss = loss_phase_1 + loss_phase_2
111
+
112
+ return {
113
+ "logits_per_texts_with_images": logits_per_texts_with_images,
114
+ "logits_per_images_with_texts": logits_per_images_with_texts,
115
+ "logits_per_locations_with_images": logits_per_locations_with_images,
116
+ "logits_per_images_with_locations": logits_per_images_with_locations,
117
+ "logits_per_locations_with_texts": None,
118
+ "logits_per_texts_with_locations": None,
119
+ "loss": loss,
120
+ "vision_output": vision_output,
121
+ "text_output": text_output,
122
+ "image_embeds": image_embeds,
123
+ "text_embeds": text_embeds,
124
+ }
125
+
126
+ def contrastive_loss(self, logits: torch.Tensor) -> torch.Tensor:
127
+ return nn.functional.cross_entropy(
128
+ logits, torch.arange(len(logits), device=logits.device)
129
+ )
130
+
131
+ def clip_loss(self, similarity: torch.Tensor) -> torch.Tensor:
132
+ caption_loss = self.contrastive_loss(similarity)
133
+ image_loss = self.contrastive_loss(similarity.t())
134
+ return (caption_loss + image_loss) / 2.0
src/g3/dataset.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import tarfile
4
+ from io import BytesIO
5
+ from pathlib import Path
6
+ from typing import Callable, Optional
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+ import torchvision.transforms as T
12
+ import transformers
13
+ from PIL import Image, ImageFile
14
+ from torch.utils.data import DataLoader, get_worker_info
15
+ from torchvision.datasets import VisionDataset
16
+ from torchvision.io import ImageReadMode, read_image
17
+ from tqdm import tqdm
18
+ from transformers import (
19
+ CLIPImageProcessor,
20
+ CLIPModel,
21
+ CLIPTextModel,
22
+ CLIPTokenizer,
23
+ CLIPVisionModel,
24
+ )
25
+
26
+ ImageFile.LOAD_TRUNCATED_IMAGES = True # Allow truncated images to be loaded
27
+
28
+ from io import BytesIO
29
+ from typing import Any, Dict, Iterator, Optional, Tuple
30
+
31
+ import torch
32
+ import torchvision.transforms as T
33
+ from datasets import load_dataset
34
+ from huggingface_hub import login
35
+ from PIL import Image
36
+ from torch.utils.data import DataLoader, IterableDataset, get_worker_info
37
+
38
+ __all__ = [
39
+ "MP16StreamingDataset",
40
+ "mp16_collate",
41
+ ]
42
+
43
+
44
class MP16StreamingDataset(IterableDataset):
    """Stream **MP-16** samples from the HuggingFace Hub and yield a simple
    tuple per example::

        (image, text, longitude, latitude)

    * **image** – either a tensor (``C×H×W``) if *vision_processor* is set or if
      the fallback transform is used, otherwise a PIL image.
    * **text** – caption string (either provided by the dataset or generated
      from location fields).
    * **longitude**, **latitude** – floats.

    The class is an :class:`torch.utils.data.IterableDataset`, so wrap it in a
    :class:`~torch.utils.data.DataLoader` for batching.
    """

    def __init__(
        self,
        repo_id: str = "tduongvn/MP16-Pro-shards",
        split: str = "train",
        vision_processor: Optional[Any] = None,
        shuffle_buffer: int = 10_000,
        HF_TOKEN: Optional[str] = None,
    ) -> None:
        super().__init__()
        self.repo_id = repo_id
        self.split = split
        self.vision_processor = vision_processor
        # Size of the streaming shuffle buffer (trades memory for randomness).
        self.shuffle_buffer = shuffle_buffer
        self.HF_TOKEN = HF_TOKEN

        # Base transform when we *don't* have a fancy processor
        self.fallback_transform = T.Compose(
            [
                T.RandomHorizontalFlip(),
                T.RandomResizedCrop(size=224),
                T.ToTensor(),
            ]
        )

        # Prepare an initial dataset iterator for the main process
        self._base_iter = self._new_iterator()

    def _new_iterator(self):
        """Open a fresh shuffled streaming iterator over the Hub dataset.

        Logs in first when an HF token was supplied (needed for gated/private
        repos).
        """
        if self.HF_TOKEN is not None:
            login(token=self.HF_TOKEN)
        return (
            load_dataset(self.repo_id, split=self.split, streaming=True)
            .shuffle(buffer_size=self.shuffle_buffer)
            .__iter__()
        )

    def _decode_image(self, img_bytes):
        """bytes → PIL.Image or tensor (if processor is set)."""
        img = Image.open(BytesIO(img_bytes)).convert("RGB")
        if self.vision_processor is not None:
            return self.vision_processor(images=img, return_tensors="pt")[
                "pixel_values"
            ].squeeze(0)
        return self.fallback_transform(img)

    def _caption(self, ex_json: Dict[str, Any]) -> str:
        """Build a caption from whichever of city/state/country are present.

        NOTE(review): when all three fields are missing this returns the bare
        prefix "A street view photo taken in " — confirm that is acceptable.
        """
        parts = [ex_json.get(k) for k in ("city", "state", "country") if ex_json.get(k)]
        return "A street view photo taken in " + ", ".join(parts)

    def __iter__(self) -> Iterator[Tuple[Any, str, float, float]]:
        # Each DataLoader worker gets its own iterator to avoid state clashes.
        worker = get_worker_info()
        iterator = self._new_iterator() if worker is not None else self._base_iter

        for ex in iterator:
            # Dataset structure: {'jpg': <PIL or bytes>, 'json': {...}, ...}
            img_field = ex["jpg"]
            if isinstance(img_field, Image.Image):
                img = img_field.convert("RGB")
                if self.vision_processor is not None:
                    img = self.vision_processor(images=img, return_tensors="pt")[
                        "pixel_values"
                    ].squeeze(0)
                else:
                    img = self.fallback_transform(img)
            else:  # bytes
                img = self._decode_image(img_field)

            meta = ex["json"] if "json" in ex else {}
            # NOTE(review): float(...) raises TypeError when neither the
            # lower- nor upper-case key is present — confirm every shard
            # carries coordinates.
            lon = float(meta.get("lon", meta.get("LON")))
            lat = float(meta.get("lat", meta.get("LAT")))
            text = meta.get("text") or self._caption(meta)

            yield img, text, lon, lat

    # No __len__ – this is a stream.
142
+
143
+
144
+ # ─────────────────────────────────────────────────────────────────────────────
145
+ # Collate ─┘
146
+
147
+
148
def make_mp16_collate(text_processor):
    """Build a DataLoader ``collate_fn`` for (image, text, lon, lat) samples.

    The returned callable stacks images into a batch tensor, tokenizes the
    captions with *text_processor* (CLIP-style, max length 77), and converts
    coordinates to float32 tensors.
    """

    def collate(samples):
        imgs, captions, longs, latts = zip(*samples)

        pixel_batch = torch.stack(imgs)  # (B, C, H, W)

        tokens = text_processor(
            list(captions),
            padding="longest",
            truncation=True,
            max_length=77,
            return_tensors="pt",
        )

        lon_tensor = torch.tensor(longs, dtype=torch.float32)
        lat_tensor = torch.tensor(latts, dtype=torch.float32)

        return pixel_batch, tokens, lon_tensor, lat_tensor

    return collate
168
+
169
+
170
class MP16Dataset(VisionDataset):
    """Map-style dataset over the MP16-Pro tarball + metadata CSV.

    Each item is ``(image, caption, longitude, latitude)``. Images are read
    directly out of an uncompressed tar archive via a per-worker tarfile
    handle; member offsets are cached in a pickle index so the tar only has
    to be scanned once.
    """

    def __init__(
        self,
        root_path="data/mp16/",
        text_data_path="MP16_Pro_places365.csv",
        image_data_path="mp-16-images.tar",
        member_info_path="tar_index.pkl",
        vision_processor=None,
        text_processor=None,
    ):
        # NOTE(review): passes the dataset instance itself as VisionDataset's
        # `root` argument — looks unintended, confirm against torchvision API.
        super().__init__(self)
        self.root_path = root_path
        self.text_data_path = text_data_path
        self.image_data_path = image_data_path
        self.text_data = pd.read_csv(os.path.join(self.root_path, self.text_data_path))
        # Tar members are stored flat, with '/' in the original IDs replaced
        # by '_', so normalize the CSV IDs the same way.
        self.text_data["IMG_ID"] = self.text_data["IMG_ID"].apply(
            lambda x: x.replace("/", "_")
        )
        # self.text_data = self.text_data[self.text_data['IMG_ID'].str.endswith('.jpg')] # only keep jpg images
        print("read text data success")
        # One tar handle per DataLoader worker (tarfile objects are not safe
        # to share across processes); None keys the main process.
        worker = get_worker_info()
        worker = worker.id if worker else None
        self.tar_obj = {worker: tarfile.open(os.path.join(root_path, image_data_path))}
        # self.tar = tarfile.open(os.path.join(root_path, image_data_path))

        if os.path.exists(os.path.join(self.root_path, member_info_path)):
            # Fast path: reuse the cached name → TarInfo index.
            with open(os.path.join(self.root_path, member_info_path), "rb") as f:
                self.tar_index = pickle.load(f)
            all_image_names = list(self.tar_index.keys())
            print("load tar index success")
        else:
            # Slow path: scan the whole tar once, keeping only .jpg members
            # larger than 5 KiB, then persist the index for next time.
            print("no exist tar index success, need building...")
            self.tar_index = {}
            all_image_names = []
            for member in tqdm(self.tar_obj[worker]):
                if member.name.endswith(".jpg") and member.size > 5120:
                    self.tar_index[member.name.split("/")[1]] = member
                    all_image_names.append(member.name.split("/")[1])
            print("tar index buidling success")
            with open(os.path.join(self.root_path, member_info_path), "wb") as f:
                pickle.dump(self.tar_index, f)
        all_image_names = set(all_image_names)

        # Keep only rows with a known country and an image in the archive.
        self.text_data = self.text_data[self.text_data["country"].notnull()]
        self.text_data = self.text_data[self.text_data["IMG_ID"].isin(all_image_names)]
        print("data columns: ", self.text_data.shape[0])

        # location from str to float
        self.text_data.loc[:, "LON"] = self.text_data["LON"].astype(float)
        self.text_data.loc[:, "LAT"] = self.text_data["LAT"].astype(float)
        print("location from str to float success")

        # image transform
        self.transform = T.Resize(size=(512, 512))
        self.transform_totensor = T.ToTensor()

        self.vision_processor = vision_processor
        self.text_processor = text_processor

        # Define the contrast transforms here
        # (SimCLR-style augmentations; currently not applied in __getitem__.)
        self.contrast_transforms = T.Compose(
            [
                T.RandomHorizontalFlip(),
                T.RandomResizedCrop(size=224),
                T.RandomApply(
                    [
                        T.ColorJitter(
                            brightness=0.5, contrast=0.5, saturation=0.5, hue=0.1
                        )
                    ],
                    p=0.8,
                ),
                T.RandomGrayscale(p=0.2),
                T.GaussianBlur(kernel_size=9),
                T.ToTensor(),
                # T.Normalize((0.5,), (0.5,))
            ]
        )

        # self.text_data.to_csv('/data/mp-16/MP16_Pro_filtered.csv', index=False)

    def caption_generation(self, row):
        # Placeholder — captions are currently built inline in __getitem__.
        pass

    def __getitem__(self, index):
        """Return ``(image, caption, longitude, latitude)`` for row *index*."""
        image_path = self.text_data.iloc[index]["IMG_ID"]
        text = ""
        neighbourhood, city, county, state, region, country, continent = (
            self.text_data.iloc[index][
                [
                    "neighbourhood",
                    "city",
                    "county",
                    "state",
                    "region",
                    "country",
                    "continent",
                ]
            ]
        )
        # location_elements = [element for element in [neighbourhood, city, state, country] if element is not np.nan and str(element) != 'nan']
        # Build the caption from whichever of city/state/country are non-NaN.
        location_elements = [
            element
            for element in [city, state, country]
            if element is not np.nan and str(element) != "nan"
        ]
        text = "A street view photo taken in " + ", ".join(location_elements)

        longitude = self.text_data.iloc[index]["LON"]
        latitude = self.text_data.iloc[index]["LAT"]
        # read the image from self.tar
        # Lazily open a tar handle for this worker if it doesn't have one yet.
        worker = get_worker_info()
        worker = worker.id if worker else None
        if worker not in self.tar_obj:
            self.tar_obj[worker] = tarfile.open(
                os.path.join(self.root_path, self.image_data_path)
            )
        image = self.tar_obj[worker].extractfile(self.tar_index[image_path])
        image = Image.open(image)

        if image.mode != "RGB":
            image = image.convert("RGB")

        if self.vision_processor:
            image = self.vision_processor(images=image, return_tensors="pt")[
                "pixel_values"
            ].reshape(3, 224, 224)

        return image, text, longitude, latitude

    def __len__(self):
        return len(self.text_data)
302
+
303
+
304
class im2gps3kDataset(VisionDataset):
    """Evaluation dataset for the im2gps3k benchmark.

    Items are ``(image, IMG_ID, longitude, latitude)``; the image filename is
    returned in place of a caption so predictions can be matched back to rows.
    """

    def __init__(
        self,
        root_path="./data/im2gps3k",
        text_data_path="im2gps3k_places365.csv",
        image_data_path="images/",
        vision_processor=None,
        text_processor=None,
    ):
        # NOTE(review): passes the dataset instance itself as VisionDataset's
        # `root` argument — looks unintended, confirm against torchvision API.
        super().__init__(self)
        print("start loading im2gps...")
        self.root_path = root_path
        self.text_data_path = text_data_path
        self.image_data_path = image_data_path
        self.text_data = pd.read_csv(os.path.join(self.root_path, self.text_data_path))
        # self.text_data = self.text_data[self.text_data['IMG_ID'].str.endswith('.jpg')] # only keep jpg images
        print("read text data success")

        # location from str to float
        self.text_data.loc[:, "LAT"] = self.text_data["LAT"].astype(float)
        self.text_data.loc[:, "LON"] = self.text_data["LON"].astype(float)
        print("location from str to float success")

        self.vision_processor = vision_processor
        self.text_processor = text_processor

        # Ten-crop transform kept for optional TTA (disabled in __getitem__).
        self.tencrop = T.TenCrop(224)

    def __getitem__(self, index):
        """Return ``(image, IMG_ID, longitude, latitude)`` for row *index*."""
        image_path = self.text_data.iloc[index]["IMG_ID"]
        text = image_path

        longitude = self.text_data.iloc[index]["LON"]
        latitude = self.text_data.iloc[index]["LAT"]

        image = Image.open(
            os.path.join(self.root_path, self.image_data_path, image_path)
        )

        if image.mode != "RGB":
            image = image.convert("RGB")

        # image = self.tencrop(image) # for tencrop

        if self.vision_processor:
            # -1 first dim keeps compatibility with ten-crop output as well.
            image = self.vision_processor(images=image, return_tensors="pt")[
                "pixel_values"
            ].reshape(-1, 224, 224)

        return image, text, longitude, latitude

    def __len__(self):
        return len(self.text_data)
357
+
358
+
359
class yfcc4kDataset(VisionDataset):
    """Evaluation dataset for the YFCC4k benchmark.

    Items are ``(image, IMG_ID, longitude, latitude)``; the image filename is
    returned in place of a caption so predictions can be matched back to rows.
    """

    def __init__(
        self,
        root_path="./data/yfcc4k",
        text_data_path="yfcc4k_places365.csv",
        image_data_path="images/",
        vision_processor=None,
        text_processor=None,
    ):
        # NOTE(review): passes the dataset instance itself as VisionDataset's
        # `root` argument — looks unintended, confirm against torchvision API.
        super().__init__(self)
        print("start loading yfcc4k...")
        self.root_path = root_path
        self.text_data_path = text_data_path
        self.image_data_path = image_data_path
        self.text_data = pd.read_csv(os.path.join(self.root_path, self.text_data_path))
        # self.text_data = self.text_data[self.text_data['IMG_ID'].str.endswith('.jpg')] # only keep jpg images
        print("read text data success")

        # location from str to float
        self.text_data.loc[:, "LAT"] = self.text_data["LAT"].astype(float)
        self.text_data.loc[:, "LON"] = self.text_data["LON"].astype(float)
        print("location from str to float success")

        self.vision_processor = vision_processor
        self.text_processor = text_processor

    def __getitem__(self, index):
        """Return ``(image, IMG_ID, longitude, latitude)`` for row *index*."""
        image_path = self.text_data.iloc[index]["IMG_ID"]
        text = image_path

        longitude = self.text_data.iloc[index]["LON"]
        latitude = self.text_data.iloc[index]["LAT"]

        image = Image.open(
            os.path.join(self.root_path, self.image_data_path, image_path)
        )

        if image.mode != "RGB":
            image = image.convert("RGB")

        if self.vision_processor:
            image = self.vision_processor(images=image, return_tensors="pt")[
                "pixel_values"
            ].reshape(-1, 224, 224)

        return image, text, longitude, latitude

    def __len__(self):
        return len(self.text_data)
src/g3/hparams.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # sh_siren:
2
+ # legendre_polys: 30
3
+ # harmonics_calculation: analytic
4
+ # hidden_dim: 512
5
+ # num_layers: 3
6
+ # lr: 7.887855321604208e-05
7
+ # wd: 1.3475466222160537e-06
8
+
9
+ sh_siren:
10
+ legendre_polys: 40
11
+ harmonics_calculation: analytic
12
+ hidden_dim: 512
13
+ output_dim: 256
14
+ num_layers: 2
15
+ lr: 0.0001
16
+ wd: 0.01
17
+
18
+
19
+ projection_eep_rffmlp:
20
+ projection: eep
21
+ sigma:
22
+ - 1
23
+ - 16
24
+ - 256
25
+ hidden_dim: 1024
26
+ lr: 0.00003
27
+ wd: 0.000001
28
+
29
+ projection_mercator_rffmlp:
30
+ projection: mercator
31
+ sigma:
32
+ - 1
33
+ - 16
34
+ - 256
35
+ hidden_dim: 1024
36
+ lr: 0.00003
37
+ wd: 0.000001
38
+
39
+ projection_ecef_rffmlp:
40
+ projection: ecef
41
+ sigma:
42
+ - 1
43
+ - 16
44
+ - 256
45
+ hidden_dim: 1024
46
+ lr: 0.00003
47
+ wd: 0.000001
48
+
49
# NOTE: this key duplicates `projection_eep_rffmlp` defined earlier in this
# file with identical values. YAML silently keeps only the last occurrence —
# remove one copy, or rename this entry if a distinct configuration was meant.
projection_eep_rffmlp:
  projection: eep
  sigma:
    - 1
    - 16
    - 256
  hidden_dim: 1024
  lr: 0.00003
  wd: 0.000001
src/g3/locationencoder.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from .nn.mlp import MLP
4
+ from .nn.rff_mlp import RFFMLP
5
+ from .nn.siren import SirenNet
6
+ from .pe.projection import Projection
7
+ from .pe.projection_rff import ProjectionRFF
8
+ from .pe.spherical_harmonics import SphericalHarmonics
9
+
10
+
11
+ def get_positional_encoding(positional_encoding_type, hparams, device="cuda"):
12
+ """
13
+ Returns a positional encoding module based on the specified encoding type.
14
+
15
+ Args:
16
+ encoding_type (str): The type of positional encoding to use. Options are 'rff', 'siren', 'sh', 'capsule'.
17
+ input_dim (int): The input dimension for the positional encoding.
18
+ output_dim (int): The output dimension for the positional encoding.
19
+ hparams: Additional arguments for specific encoding types.
20
+
21
+ Returns:
22
+ nn.Module: The positional encoding module.
23
+ """
24
+ if positional_encoding_type == "projectionrff":
25
+ return ProjectionRFF(
26
+ projection=hparams["projection"],
27
+ sigma=hparams["sigma"],
28
+ hparams=hparams,
29
+ device=device,
30
+ )
31
+ elif positional_encoding_type == "projection":
32
+ return Projection(
33
+ projection=hparams["projection"], hparams=hparams, device=device
34
+ )
35
+ elif positional_encoding_type == "sh":
36
+ return SphericalHarmonics(
37
+ legendre_polys=hparams["legendre_polys"],
38
+ harmonics_calculation=hparams["harmonics_calculation"],
39
+ hparams=hparams,
40
+ device=device,
41
+ )
42
+ else:
43
+ raise ValueError(f"Unsupported encoding type: {positional_encoding_type}")
44
+
45
+
46
def get_neural_network(
    neural_network_type: str,
    input_dim: int,
    hparams: dict,
    device="cuda",
):
    """
    Return a neural network module for the requested network type.

    Args:
        neural_network_type (str): One of 'siren', 'mlp' or 'rffmlp'.
            (The previous docstring claimed only 'siren' was supported.)
        input_dim (int): The input dimension for the neural network.
        hparams (dict): Hyper-parameters — 'output_dim'/'hidden_dim'/
            'num_layers' for 'siren', 'hidden_dim' for 'mlp',
            'hidden_dim'/'sigma' for 'rffmlp'.
        device (str): Device passed through to the network module.

    Returns:
        nn.Module: The neural network module.

    Raises:
        ValueError: If the network type is not supported.
    """
    if neural_network_type == "siren":
        return SirenNet(
            input_dim=input_dim,
            output_dim=hparams["output_dim"],
            hidden_dim=hparams["hidden_dim"],
            num_layers=hparams["num_layers"],
            hparams=hparams,
            device=device,
        )
    elif neural_network_type == "mlp":
        return MLP(
            input_dim=input_dim,
            hidden_dim=hparams["hidden_dim"],
            hparams=hparams,
            device=device,
        )
    elif neural_network_type == "rffmlp":
        return RFFMLP(
            input_dim=input_dim,
            hidden_dim=hparams["hidden_dim"],
            sigma=hparams["sigma"],
            hparams=hparams,
            device=device,
        )
    else:
        raise ValueError(f"Unsupported network type: {neural_network_type}")
90
+
91
+
92
class LocationEncoder(nn.Module):
    """Encode coordinate pairs into a 512-d feature vector.

    A positional encoder maps raw coordinates to one embedding per hierarchy
    level; a per-level neural network maps each embedding to a 512-d feature
    and the per-level features are summed.
    """

    def __init__(
        self,
        positional_encoding_type: str = "sh",
        neural_network_type: str = "siren",
        hparams: dict | None = None,
        device: str = "cuda",
    ):
        super().__init__()
        self.device = device

        # NOTE(review): hparams may still be None here; the positional
        # encoders index into it, so a None hparams fails for most types.
        self.position_encoder = get_positional_encoding(
            positional_encoding_type=positional_encoding_type,
            hparams=hparams,
            device=device,
        )

        if hparams is None:
            hparams = {}

        # One sub-network per hierarchy level reported by the encoder.
        self.neural_network = nn.ModuleList(
            [
                get_neural_network(
                    neural_network_type, input_dim=dim, hparams=hparams, device=device
                )
                for dim in self.position_encoder.embedding_dim
            ]
        )

    def forward(self, x):
        """Return summed per-level features, shape (batch, 512)."""
        embedding = self.position_encoder(x)

        if embedding.ndim == 2:
            # Single-level encoders return (batch, n); add a leading level
            # axis so the loop below handles both cases uniformly.
            embedding = embedding.unsqueeze(0)

        # NOTE(review): the 512 feature width must match every sub-network's
        # output dimension — confirm against hparams['output_dim'].
        location_features = torch.zeros(embedding.shape[1], 512).to(self.device)

        # Bug fix: the loop variable was named `nn`, shadowing the imported
        # `torch.nn` module inside this scope; renamed to `net`.
        for net, level_embedding in zip(self.neural_network, embedding):
            location_features += net(level_embedding)

        return location_features
src/g3/nn/mlp.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+
3
class MLP(nn.Module):
    """Plain feed-forward perceptron: three hidden ReLU layers plus a linear head.

    Note: the previous docstring claimed batch normalization was applied —
    there is none; the network is Linear/ReLU only.

    Args:
        input_dim: Size of the input features.
        hidden_dim: Width of the three hidden layers.
        output_dim: Size of the output features.
        hparams: Accepted for interface uniformity with the other networks;
            unused here.
        device: Stored for interface uniformity; unused here.
    """

    def __init__(self, input_dim=512, hidden_dim=1024, output_dim=512, hparams=None, device='cuda'):
        super().__init__()
        self.device = device
        # Feature extractor: three Linear→ReLU stages.
        self.capsule = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        # Linear projection head to the output width.
        self.head = nn.Sequential(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        """Return the MLP output, shape (..., output_dim)."""
        x = self.capsule(x)
        x = self.head(x)
        return x
src/g3/nn/rff_mlp.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ import torch
3
+ from ..rff.layers import GaussianEncoding
4
+
5
class LocationEncoderCapsule(nn.Module):
    """Single RFF-MLP "capsule": a Gaussian random-Fourier-feature encoding at
    one bandwidth *sigma*, followed by a three-hidden-layer ReLU MLP and a
    linear head.
    """

    def __init__(self, input_dim=2, hidden_dim=1024, output_dim=512, sigma=2**0):
        super(LocationEncoderCapsule, self).__init__()
        # encoded_size is output_dim/2 — the following Linear expects
        # output_dim inputs, implying GaussianEncoding doubles its encoded
        # size (cos/sin pairs); confirm against the rff library.
        rff_encoding = GaussianEncoding(sigma=sigma, input_size=input_dim, encoded_size=int(output_dim/2))
        self.capsule = nn.Sequential(rff_encoding,
                                     nn.Linear(output_dim, hidden_dim),
                                     nn.ReLU(),
                                     nn.Linear(hidden_dim, hidden_dim),
                                     nn.ReLU(),
                                     nn.Linear(hidden_dim, hidden_dim),
                                     nn.ReLU())
        self.head = nn.Sequential(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        """Encode *x* and return features of shape (..., output_dim)."""
        x = self.capsule(x)
        x = self.head(x)
        return x
22
+
23
class RFFMLP(nn.Module):
    """Sum of random-Fourier-feature MLP capsules, one per sigma bandwidth.

    Each ``LocationEncoderCapsule`` encodes the input at a different Gaussian
    bandwidth; the capsule outputs are summed into one feature vector of
    shape (batch, output_dim).
    """

    def __init__(self, input_dim=2, hidden_dim=1024, output_dim=512, sigma=[2**0, 2**4, 2**8], hparams=None, device='cuda'):
        super(RFFMLP, self).__init__()
        self.num_hierarchies = len(sigma)
        self.device = device
        # Bug fix: remember output_dim — forward() previously hard-coded 512,
        # causing a shape mismatch whenever output_dim != 512.
        self.output_dim = output_dim

        # One capsule per bandwidth, registered as LocEnc0, LocEnc1, ...
        for i, s in enumerate(sigma):
            self.add_module('LocEnc' + str(i), LocationEncoderCapsule(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, sigma=s))

    def forward(self, input):
        """Return the summed capsule features, shape (batch, output_dim)."""
        location_features = torch.zeros(input.shape[0], self.output_dim).to(self.device)

        for i in range(self.num_hierarchies):
            location_features += self._modules['LocEnc' + str(i)](input)
        return location_features
src/g3/nn/siren.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange
6
+
7
+ # helpers
8
+
9
def exists(val):
    """True iff *val* carries a value (i.e. is not None)."""
    return not (val is None)
11
+
12
def cast_tuple(val, repeat = 1):
    """Return *val* unchanged if it is a tuple, else repeat it into one."""
    if isinstance(val, tuple):
        return val
    return (val,) * repeat
14
+
15
+ # sin activation
16
+
17
class Sine(nn.Module):
    """Sine activation with tunable frequency: x ↦ sin(w0 · x)."""

    def __init__(self, w0 = 1.):
        super().__init__()
        self.w0 = w0

    def forward(self, x):
        scaled = x * self.w0
        return torch.sin(scaled)
23
+
24
+ # siren layer
25
+
26
class Siren(nn.Module):
    """Single SIREN layer: ``sin(w0 * (W x + b))`` with the initialization
    scheme from Sitzmann et al. (2020).

    Args:
        input_dim: Incoming feature size.
        output_dim: Produced feature size.
        w0: Frequency factor inside the sine activation.
        c: Constant controlling the uniform init bound for non-first layers.
        is_first: First layer uses the wider ``1/dim`` init bound.
        use_bias: Whether to learn an additive bias.
        activation: Optional replacement for the default ``Sine(w0)``.
        dropout: If True, applies default-rate dropout before the activation.
    """

    def __init__(self, input_dim, output_dim, w0 = 1., c = 6., is_first = False, use_bias = True, activation = None, dropout = False):
        super().__init__()
        self.input_dim = input_dim
        self.is_first = is_first
        self.output_dim = output_dim
        self.dropout = dropout

        # Create plain tensors first so they can be initialized in-place
        # before being wrapped as parameters.
        weight = torch.zeros(output_dim, input_dim)
        bias = torch.zeros(output_dim) if use_bias else None
        self.init_(weight, bias, c = c, w0 = w0)

        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias) if use_bias else None
        self.activation = Sine(w0) if activation is None else activation

    def init_(self, weight, bias, c, w0):
        """SIREN init: U(-1/dim, 1/dim) for the first layer, otherwise
        U(-sqrt(c/dim)/w0, sqrt(c/dim)/w0)."""
        dim = self.input_dim

        w_std = (1 / dim) if self.is_first else (math.sqrt(c / dim) / w0)
        weight.uniform_(-w_std, w_std)

        if exists(bias):
            bias.uniform_(-w_std, w_std)

    def forward(self, x):
        """Linear transform, optional dropout, then the (sine) activation."""
        out = F.linear(x, self.weight, self.bias)
        if self.dropout:
            out = F.dropout(out, training=self.training)
        out = self.activation(out)
        return out
57
+
58
+ # siren network
59
+
60
class SirenNet(nn.Module):
    """Stack of SIREN layers mapping ``input_dim`` → ``output_dim``.

    The first layer uses ``w0_initial`` (typically 30) as its sine frequency;
    all later layers use ``w0``. Optional per-layer multiplicative modulations
    can be supplied at call time via ``mods``. ``hparams``/``device`` are
    accepted for interface uniformity with the other networks in this package.
    """

    def __init__(self, input_dim = 512, hidden_dim = 1024, output_dim = 512, num_layers = 3, w0 = 1., w0_initial = 30., use_bias = True, final_activation = None, degreeinput = False, dropout = False, hparams=None, device='cuda'):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        # When True, inputs are degrees and get shifted into a radian range.
        self.degreeinput = degreeinput
        self.device = device

        self.layers = nn.ModuleList([])
        for ind in range(num_layers):
            is_first = ind == 0
            layer_w0 = w0_initial if is_first else w0
            layer_input_dim = input_dim if is_first else hidden_dim

            self.layers.append(Siren(
                input_dim = layer_input_dim,
                output_dim = hidden_dim,
                w0 = layer_w0,
                use_bias = use_bias,
                is_first = is_first,
                dropout = dropout
            ))

        # Final layer maps hidden_dim → output_dim; identity activation unless
        # an explicit final_activation was provided.
        final_activation = nn.Identity() if not exists(final_activation) else final_activation
        self.last_layer = Siren(input_dim = hidden_dim, output_dim = output_dim, w0 = w0, use_bias = use_bias, activation = final_activation, dropout = False)

    def forward(self, x, mods = None):
        """Forward pass; ``mods`` may be a single value or one per layer."""

        # do some normalization to bring degrees in a -pi to pi range
        if self.degreeinput:
            x = torch.deg2rad(x) - torch.pi

        # Broadcast a single modulation across all layers.
        mods = cast_tuple(mods, self.num_layers)

        for layer, mod in zip(self.layers, mods):
            x = layer(x)

            if exists(mod):
                # Feature-wise multiplicative modulation, broadcast over batch.
                x *= rearrange(mod, 'd -> () d')

        return self.last_layer(x)
src/g3/pe/projection.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import pandas as pd
5
+ import itertools
6
+ from transformers import CLIPTokenizer, CLIPImageProcessor, CLIPModel
7
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
8
+ from pyproj import Proj, Transformer
9
+
10
+ SF = 66.50336
11
+
12
class Projection(nn.Module):
    """Positional encoding that maps (lat, lon) degrees to normalized
    projected coordinates: Web-Mercator, Equal Earth ('eep'), or ECEF.

    ``embedding_dim`` reports the per-level output widths consumed by
    LocationEncoder (2 for planar projections, 3 for ECEF). ``hparams`` is
    accepted for interface uniformity but unused here.
    """

    def __init__(self, projection="mercator", hparams=None, device='cuda'):
        super(Projection, self).__init__()
        self.device = device
        self.projection = projection.lower()

        # Source CRS: WGS-84 geographic coordinates.
        proj_wgs84 = Proj('epsg:4326')

        if self.projection == "mercator":
            proj_target = Proj('epsg:3857')
            # Half-extent of the Web-Mercator plane in metres.
            self.normalizer = 20037508.3427892
            self.embedding_dim = [2]
        elif self.projection == "eep":
            # Equal Earth projection.
            proj_target = Proj('epsg:8857')
            self.normalizer = 180/SF
            self.embedding_dim = [2]
        elif self.projection == "ecef":
            # Earth-centred, Earth-fixed geocentric XYZ.
            proj_target = Proj('epsg:4978')
            self.normalizer = 6378137.0 # radius of Earth, not exact for ECEF but usable
            self.embedding_dim = [3]
        else:
            raise ValueError(f"Unsupported projection: {self.projection}")

        self.transformer = Transformer.from_proj(proj_wgs84, proj_target, always_xy=True)

    def forward(self, input):
        """Project a (batch, 2) tensor of [lat, lon] degrees; returns a
        normalized (batch, 2) or (batch, 3) tensor on ``self.device``."""
        # Column 0 is latitude, column 1 is longitude.
        lat = input[:, 0].float().detach().cpu().numpy()
        lon = input[:, 1].float().detach().cpu().numpy()
        # lon (batch), lat (batch)

        # Shape: (batch, 2) or (batch, 3) depending on projection
        if self.projection == "ecef":
            # Assume sea-level altitude for the third input coordinate.
            alt = np.zeros_like(lat)
            projected = self.transformer.transform(lon, lat, alt)
            location = list(zip(*projected))  # X, Y, Z
            location = torch.Tensor(location).to(self.device)
        else:
            # NOTE(review): output columns are [y, x] (northing first) —
            # confirm downstream consumers expect this order.
            projected = self.transformer.transform(lon, lat)
            location = [[y, x] for x, y in zip(*projected)]
            location = torch.Tensor(location).to(self.device)

        # Scale into an approximately unit range.
        location = location / self.normalizer
        return location
src/g3/pe/projection_rff.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import pandas as pd
5
+ import itertools
6
+ from transformers import CLIPTokenizer, CLIPImageProcessor, CLIPModel
7
+ from torch.nn import TransformerEncoder, TransformerEncoderLayer
8
+ from ..rff.layers import GaussianEncoding
9
+ from pyproj import Proj, Transformer
10
+
11
+ SF = 66.50336
12
+
13
class ProjectionRFF(nn.Module):
    """Positional encoding: project (lat, lon) to map coordinates, then apply
    Gaussian random Fourier features at several bandwidths.

    Produces one 512-d embedding per sigma value; ``embedding_dim`` reports
    those widths for LocationEncoder. ``hparams`` is accepted for interface
    uniformity but unused here.
    """

    def __init__(self, projection="ecef", sigma=[2**0, 2**4, 2**8], hparams=None, device='cuda'):
        super(ProjectionRFF, self).__init__()
        self.device = device
        self.sigma = sigma
        self.num_hierarchies = len(self.sigma)
        self.projection = projection.lower()
        # One 512-wide embedding per bandwidth (GaussianEncoding with
        # encoded_size=256 — presumably doubled to 512 by cos/sin pairs;
        # confirm against the rff library).
        self.embedding_dim = [512] * self.num_hierarchies

        # Source CRS: WGS-84 geographic coordinates.
        proj_wgs84 = Proj('epsg:4326')
        if self.projection == "mercator":
            proj_target = Proj('epsg:3857')
            input_dim = 2
            # Half-extent of the Web-Mercator plane in metres.
            self.normalizer = 20037508.3427892
        elif self.projection == "eep":
            # Equal Earth projection.
            proj_target = Proj('epsg:8857')
            input_dim = 2
            self.normalizer = 180/SF
        elif self.projection == "ecef":
            # Earth-centred, Earth-fixed geocentric XYZ.
            proj_target = Proj('epsg:4978')
            input_dim = 3
            self.normalizer = 6378137.0 # radius of Earth, not exact for ECEF but usable
        else:
            raise ValueError(f"Unsupported projection: {self.projection}")

        self.transformer = Transformer.from_proj(proj_wgs84, proj_target, always_xy=True)
        # One Gaussian RFF encoder per bandwidth: LocEnc0, LocEnc1, ...
        for i, s in enumerate(self.sigma):
            self.add_module('LocEnc' + str(i), GaussianEncoding(sigma=s, input_size=input_dim, encoded_size=256))

    def forward(self, input):
        """Encode a (batch, 2) tensor of [lat, lon] degrees; returns a
        (num_hierarchies, batch, 512) tensor."""
        # Column 0 is latitude, column 1 is longitude.
        lat = input[:, 0].float().detach().cpu().numpy()
        lon = input[:, 1].float().detach().cpu().numpy()
        # lon (batch), lat (batch)

        # Shape: (batch, 2) or (batch, 3) depending on projection
        if self.projection == "ecef":
            # Assume sea-level altitude for the third input coordinate.
            alt = np.zeros_like(lat)
            projected = self.transformer.transform(lon, lat, alt)
            location = list(zip(*projected))  # X, Y, Z
            location = torch.Tensor(location).to(self.device)
        else:
            # NOTE(review): columns are ordered [y, x] (northing first) —
            # confirm the RFF encoders are order-agnostic as assumed.
            projected = self.transformer.transform(lon, lat)
            location = [[y, x] for x, y in zip(*projected)]
            location = torch.Tensor(location).to(self.device)

        # Scale into an approximately unit range before encoding.
        location = location / self.normalizer
        out = []

        for i in range(self.num_hierarchies):
            out.append(self._modules['LocEnc' + str(i)](location))

        location_features = torch.stack(out, dim=0)  # (hierarchies, batch, 512)
        return location_features
src/g3/pe/spherical_harmonics.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from .spherical_harmonics_ylm import SH as SH_analytic
4
+ from .spherical_harmonics_closed_form import SH as SH_closed_form
5
+
6
class SphericalHarmonics(nn.Module):
    """Positional encoding via real spherical harmonics of coordinates given
    in degrees; produces an (batch, L*L) embedding."""

    def __init__(self, legendre_polys: int = 20, harmonics_calculation="analytic", hparams=None, device='cuda'):
        """
        legendre_polys: determines the number of legendre polynomials.
            more polynomials lead more fine-grained resolutions
        harmonics_calculation: how the harmonics are evaluated —
            'analytic' uses pre-computed equations. This is exact, but works
            only up to degree 50; 'closed-form' uses one equation but is
            computationally slower (especially for high degrees).
        hparams: accepted for interface uniformity; unused here.
        """
        super(SphericalHarmonics, self).__init__()
        self.device = device
        self.L, self.M = int(legendre_polys), int(legendre_polys)
        self.embedding_dim = [self.L * self.M]

        if harmonics_calculation == "closed-form":
            self.SH = SH_closed_form
        elif harmonics_calculation == "analytic":
            self.SH = SH_analytic
        else:
            # Bug fix: previously an unknown value silently left self.SH
            # unset, failing much later with an AttributeError in forward().
            raise ValueError(
                f"Unsupported harmonics_calculation: {harmonics_calculation}"
            )

    def forward(self, lonlat):
        """Encode a (batch, 2) degree tensor; returns (batch, L*L), detached."""
        lon, lat = lonlat[:, 0], lonlat[:, 1]  # lon: (batch), lat: (batch)
        # NOTE(review): Projection/ProjectionRFF read latitude from column 0
        # and longitude from column 1, while here column 0 is treated as
        # longitude — confirm the caller's column order is consistent.

        # convert degree to rad, shifting into [0, 2π] / [0, π] ranges
        phi = torch.deg2rad(lon + 180)
        theta = torch.deg2rad(lat + 90)

        Y = []  # one (batch,) tensor per (l, m) pair; L*L in total
        for l in range(self.L):
            for m in range(-l, l + 1):
                y = self.SH(m, l, phi, theta)
                if isinstance(y, float):
                    # Constant harmonic — broadcast to the batch shape.
                    y = y * torch.ones_like(phi)
                Y.append(y)

        return torch.stack(Y, dim=-1).detach()  # (batch, L * L)
src/g3/pe/spherical_harmonics_closed_form.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+
4
+ ####################### Spherical Harmonics utilities ########################
5
+ # Code copied from https://github.com/BachiLi/redner/blob/master/pyredner/utils.py
6
+ # Code adapted from "Spherical Harmonic Lighting: The Gritty Details", Robin Green
7
+ # http://silviojemma.com/public/papers/lighting/spherical-harmonic-lighting.pdf
8
def associated_legendre_polynomial(l, m, x):
    """Evaluate the associated Legendre polynomial P_l^m elementwise on x.

    Standard three-step recurrence: build P_m^m first, then P_{m+1}^m,
    then climb degrees with the general recurrence up to P_l^m.
    (Adapted from "Spherical Harmonic Lighting: The Gritty Details".)
    """
    # Step 1: P_m^m(x) = (-1)^m (2m-1)!! (1 - x^2)^{m/2}
    p_mm = torch.ones_like(x)
    if m > 0:
        sin_term = torch.sqrt((1 - x) * (1 + x))
        double_fact = 1.0
        for _ in range(m):
            p_mm = p_mm * (-double_fact) * sin_term
            double_fact += 2.0
    if l == m:
        return p_mm

    # Step 2: P_{m+1}^m(x) = x (2m + 1) P_m^m(x)
    p_m1m = x * (2.0 * m + 1.0) * p_mm
    if l == m + 1:
        return p_m1m

    # Step 3: (l - m) P_l^m = (2l - 1) x P_{l-1}^m - (l + m - 1) P_{l-2}^m
    p_lm = torch.zeros_like(x)
    for deg in range(m + 2, l + 1):
        p_lm = ((2.0 * deg - 1.0) * x * p_m1m - (deg + m - 1.0) * p_mm) / (deg - m)
        p_mm, p_m1m = p_m1m, p_lm
    return p_lm
27
+
28
def SH_renormalization(l, m):
    """Normalization constant K_l^m for the real spherical harmonics."""
    factorial_ratio = math.factorial(l - m) / math.factorial(l + m)
    return math.sqrt((2.0 * l + 1.0) * factorial_ratio / (4 * math.pi))
31
+
32
def SH(m, l, phi, theta):
    """Real spherical harmonic Y_l^m at azimuth ``phi`` and polar angle ``theta``.

    Positive orders use the cosine branch, negative orders the sine branch,
    both scaled by sqrt(2); the zonal case (m == 0) needs no azimuthal term.
    """
    cos_theta = torch.cos(theta)
    if m == 0:
        return SH_renormalization(l, m) * associated_legendre_polynomial(l, m, cos_theta)
    if m > 0:
        order = m
        azimuthal = torch.cos(m * phi)
    else:
        order = -m
        azimuthal = torch.sin(-m * phi)
    return math.sqrt(2.0) * SH_renormalization(l, order) * azimuthal * \
        associated_legendre_polynomial(l, order, cos_theta)
src/g3/pe/spherical_harmonics_generate_ylms.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
This script prints the source code for spherical_harmonics_ylm.py to console.

spherical_harmonics pre-computes the analytical solutions to each real spherical harmonic with sympy;
the generated module contains one function per degree l and order m.

Marc Russwurm
"""
from datetime import datetime
import sys

from sympy import assoc_legendre
from sympy import cos, sin, sqrt, pi, factorial, Abs
from sympy import Symbol

# Symbolic angle variables shared by every generated expression:
# theta is the polar angle, phi the azimuthal angle.
theta = Symbol("theta")
phi = Symbol("phi")

def calc_ylm(l, m):
    """
    Build the symbolic expression for the real spherical harmonic Y_l^m.

    see last equation of https://en.wikipedia.org/wiki/Spherical_harmonics#Real_form
    """
    if m < 0:
        # Negative orders take the sine branch of the real form.
        Plm = assoc_legendre(l, Abs(m), cos(theta))
        # \bar{P}_l^{|m|}: fully normalized associated Legendre polynomial.
        Plm_bar = sqrt(((2 * l + 1) / (4 * pi)) * (factorial(l - Abs(m)) / factorial(l + Abs(m)))) * Plm

        Ylm = (-1)**m * sqrt(2) * Plm_bar * sin(Abs(m) * phi)
    elif m == 0:
        # Zonal harmonic: no azimuthal dependence.
        Ylm = sqrt((2*l + 1) / (4 * pi)) * assoc_legendre(l, m, cos(theta))
    else:  # m > 0
        Plm = assoc_legendre(l, m, cos(theta))
        Plm_bar = sqrt(((2 * l + 1) / (4 * pi)) * (factorial(l - m) / factorial(l + m))) * Plm

        Ylm = (-1)**m * sqrt(2) * Plm_bar * cos(m * phi)
    return Ylm

def print_function(l, m):
    # Emit one torch.jit.script function per (l, m) pair; .evalf() turns the
    # exact symbolic constants into floating-point literals.
    fname = f"Yl{l}_m{m}".replace("-", "_minus_")
    print()
    print("@torch.jit.script")
    print(f"def {fname}(theta, phi):")
    print("    return " + str(calc_ylm(l, m).evalf()))

# max number of Legendre Polynomials
L = 101

# Module header of the generated file, including a small dispatch helper
# (get_SH / SH) that looks the generated functions up by name at runtime.
head = """\"\"\"
analytic expressions of spherical harmonics generated with sympy file
Marc Russwurm generated """ + str(datetime.date(datetime.now())) + """

run
python """ + sys.argv[0] + """ > spherical_harmonics_ylm.py

to generate the source code
\"\"\"

import torch
from torch import cos, sin

def get_SH(m,l):
    fname = f"Yl{l}_m{m}".replace("-","_minus_")
    return globals()[fname]

def SH(m, l, phi, theta):
    Ylm = get_SH(m,l)
    return Ylm(theta, phi)
"""
print(head)
print()

for l in range(L):
    for m in range(-l,l+1):
        print_function(l,m)
src/g3/pe/spherical_harmonics_ylm.py ADDED
The diff for this file is too large to render. See raw diff
 
src/g3/rff/functional.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+
4
+ from torch import Tensor
5
+
6
+
7
def sample_b(sigma: float, size: tuple) -> Tensor:
    r"""Draw a matrix of shape :attr:`size` from :math:`\mathcal{N}(0, \sigma^2)`.

    Args:
        sigma (float): standard deviation of the Gaussian
        size (tuple): shape of the sampled matrix

    See :class:`~rff.layers.GaussianEncoding` for more details
    """
    standard_normal = torch.randn(size)
    return standard_normal * sigma
17
+
18
+
19
@torch.jit.script
def gaussian_encoding(
        v: Tensor,
        b: Tensor) -> Tensor:
    r"""Random Fourier feature map
    :math:`\gamma(\mathbf{v}) = (\cos{2 \pi \mathbf{B} \mathbf{v}} , \sin{2 \pi \mathbf{B} \mathbf{v}})`.

    Args:
        v (Tensor): input of shape :math:`(N, *, \text{input_size})`
        b (Tensor): projection matrix of shape
            :math:`(\text{encoded_layer_size}, \text{input_size})`

    Returns:
        Tensor: encoding of shape :math:`(N, *, 2 \cdot \text{encoded_layer_size})`

    See :class:`~rff.layers.GaussianEncoding` for more details.
    """
    projected = 2 * np.pi * v @ b.T
    return torch.cat((torch.cos(projected), torch.sin(projected)), dim=-1)
36
+
37
+
38
@torch.jit.script
def basic_encoding(
        v: Tensor) -> Tensor:
    r"""Basic Fourier feature map
    :math:`\gamma(\mathbf{v}) = (\cos{2 \pi \mathbf{v}} , \sin{2 \pi \mathbf{v}})`.

    Args:
        v (Tensor): input of shape :math:`(N, *, \text{input_size})`

    Returns:
        Tensor: encoding of shape :math:`(N, *, 2 \cdot \text{input_size})`

    See :class:`~rff.layers.BasicEncoding` for more details.
    """
    scaled = 2 * np.pi * v
    return torch.cat((torch.cos(scaled), torch.sin(scaled)), dim=-1)
53
+
54
+
55
@torch.jit.script
def positional_encoding(
        v: Tensor,
        sigma: float,
        m: int) -> Tensor:
    r"""Multi-frequency positional encoding
    :math:`\gamma(\mathbf{v}) = (\dots, \cos{2 \pi \sigma^{(j/m)} \mathbf{v}} , \sin{2 \pi \sigma^{(j/m)} \mathbf{v}}, \dots)`
    for :math:`j \in \{0, \dots, m-1\}`.

    Args:
        v (Tensor): input of shape :math:`(N, *, \text{input_size})`
        sigma (float): constant chosen based upon the domain of :attr:`v`
        m (int): number of frequencies per input dimension

    Returns:
        Tensor: encoding of shape :math:`(N, *, 2 \cdot m \cdot \text{input_size})`

    See :class:`~rff.layers.PositionalEncoding` for more details.
    """
    exponents = torch.arange(m, device=v.device)
    frequencies = 2 * np.pi * sigma ** (exponents / m)
    angles = frequencies * torch.unsqueeze(v, -1)
    encoded = torch.cat((torch.cos(angles), torch.sin(angles)), dim=-1)
    return encoded.flatten(-2, -1)
src/g3/rff/layers.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+
3
+ from typing import Optional
4
+ from torch import Tensor
5
+ from . import functional
6
+
7
class GaussianEncoding(nn.Module):
    """Layer for mapping coordinates using random Fourier features"""

    def __init__(self, sigma: Optional[float] = None,
                 input_size: Optional[float] = None,
                 encoded_size: Optional[float] = None,
                 b: Optional[Tensor] = None):
        r"""
        Args:
            sigma (Optional[float]): standard deviation
            input_size (Optional[float]): the number of input dimensions
            encoded_size (Optional[float]): the number of dimensions the `b` matrix maps to
            b (Optional[Tensor], optional): Optionally specify a :attr:`b` matrix already sampled
        Raises:
            ValueError:
                If :attr:`b` is provided together with any of :attr:`sigma`,
                :attr:`input_size`, or :attr:`encoded_size`; or if :attr:`b`
                is omitted and any of those three arguments is missing.
        """
        super().__init__()
        if b is not None:
            # A pre-sampled matrix excludes the sampling parameters.
            if sigma is not None or input_size is not None or encoded_size is not None:
                raise ValueError('Only specify the "b" argument when using it.')
        else:
            if sigma is None or input_size is None or encoded_size is None:
                raise ValueError(
                    'Arguments "sigma," "input_size," and "encoded_size" are required.')
            b = functional.sample_b(sigma, (encoded_size, input_size))
        # Registered as a frozen parameter so it moves with .to(device)
        # and is saved in state_dict, but is never trained.
        self.b = nn.parameter.Parameter(b, requires_grad=False)

    def forward(self, v: Tensor) -> Tensor:
        r"""Computes :math:`\gamma(\mathbf{v}) = (\cos{2 \pi \mathbf{B} \mathbf{v}} , \sin{2 \pi \mathbf{B} \mathbf{v}})`

        Args:
            v (Tensor): input tensor of shape :math:`(N, *, \text{input_size})`

        Returns:
            Tensor: Tensor mapping using random fourier features of shape :math:`(N, *, 2 \cdot \text{encoded_size})`
        """
        return functional.gaussian_encoding(v, self.b)
+
48
+
49
class BasicEncoding(nn.Module):
    """Layer for mapping coordinates using the basic encoding"""

    def forward(self, v: Tensor) -> Tensor:
        r"""Computes :math:`\gamma(\mathbf{v}) = (\cos{2 \pi \mathbf{v}} , \sin{2 \pi \mathbf{v}})`

        Args:
            v (Tensor): input tensor of shape :math:`(N, *, \text{input_size})`

        Returns:
            Tensor: mapped tensor of shape :math:`(N, *, 2 \cdot \text{input_size})`
        """
        # Stateless layer: delegates directly to the functional form.
        encoded = functional.basic_encoding(v)
        return encoded
+
63
+
64
class PositionalEncoding(nn.Module):
    """Layer for mapping coordinates using the positional encoding"""

    def __init__(self, sigma: float, m: int):
        r"""
        Args:
            sigma (float): frequency constant
            m (int): number of frequencies to map to
        """
        super().__init__()
        # Stored as plain attributes: these are hyperparameters, not weights.
        self.sigma, self.m = sigma, m

    def forward(self, v: Tensor) -> Tensor:
        r"""Computes :math:`\gamma(\mathbf{v}) = (\dots, \cos{2 \pi \sigma^{(j/m)} \mathbf{v}} , \sin{2 \pi \sigma^{(j/m)} \mathbf{v}}, \dots)`

        Args:
            v (Tensor): input tensor of shape :math:`(N, *, \text{input_size})`

        Returns:
            Tensor: mapped tensor of shape :math:`(N, *, 2 \cdot m \cdot \text{input_size})`
        """
        return functional.positional_encoding(v, self.sigma, self.m)
src/g3_batch_prediction.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ import shutil
5
+ from pathlib import Path
6
+ import time
7
+ import numpy as np
8
+ import torch
9
+ import yaml
10
+ from google import genai
11
+ from google.genai import types
12
+ from pydantic import ValidationError
13
+ from tqdm.asyncio import tqdm as atqdm
14
+
15
+ from .data_processor import DataProcessor
16
+ from .g3.G3 import G3
17
+ from .prompt import (
18
+ Evidence,
19
+ GPSPrediction,
20
+ LocationPrediction,
21
+ diversification_prompt,
22
+ location_prompt,
23
+ verification_prompt,
24
+ )
25
+ from .utils import (
26
+ calculate_similarity_scores,
27
+ extract_and_parse_json,
28
+ get_gps_from_location,
29
+ handle_async_api_call_with_retry,
30
+ image_to_base64,
31
+ )
32
+
33
+ logger = logging.getLogger("uvicorn.error")
34
+
35
+
36
class G3BatchPredictor:
    """
    Batch prediction class for processing all images and videos in a directory.

    This class:
    1. Preprocesses all images and videos in a directory.
    2. Extracts keyframes from videos and combines them with images.
    3. Passes all keyframes and images to the Gemini model for prediction.
    """

    def __init__(
        self,
        device: str = "cuda",
        input_dir: str = "data/input_data",
        prompt_dir: str = "data/prompt_data",
        cache_dir: str = "data/cache",
        index_path: str = "data/index/G3.index",
        hparams_path: str = "g3/hparams.yaml",
        database_csv_path: str = "data/dataset/mp16/MP16_Pro_filtered.csv",
        checkpoint_path: str = "data/checkpoints/mercator_finetune_weight.pth",
    ):
        """
        Initialize the BatchKeyframePredictor.

        Args:
            device (str): Device to run model on ("cuda" or "cpu")
            input_dir (str): Directory of raw input media, relative to this file
            prompt_dir (str): Directory where preprocessed prompt assets are written
            cache_dir (str): Directory for intermediate cached artifacts
            index_path (str): Path to FAISS index for RAG (required)
            hparams_path (str): YAML file keyed by "<pe>_<nn>" combination
            database_csv_path (str): CSV backing the retrieval database
            checkpoint_path (str): Path to G3 model checkpoint
        """
        self.device = torch.device(device)
        # All configured relative paths resolve against this source file's dir.
        self.base_path = Path(__file__).parent
        self.checkpoint_path = self.base_path / checkpoint_path

        self.input_dir = self.base_path / input_dir
        self.prompt_dir = self.base_path / prompt_dir
        self.cache_dir = self.base_path / cache_dir
        self.image_dir = self.prompt_dir / "images"
        self.audio_dir = self.prompt_dir / "audio"

        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.prompt_dir, exist_ok=True)
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(self.image_dir, exist_ok=True)
        os.makedirs(self.audio_dir, exist_ok=True)

        # Initialize G3 model with the hparams entry for the chosen
        # positional-encoding / neural-network combination.
        hparams = yaml.safe_load(open(self.base_path / hparams_path, "r"))
        pe = "projection_mercator"
        nn = "rffmlp"

        self.model = G3(
            device=device,
            positional_encoding_type=pe,
            neural_network_type=nn,
            hparams=hparams[f"{pe}_{nn}"],
        )
        self.__load_checkpoint()

        self.data_processor = DataProcessor(
            model=self.model,
            input_dir=self.input_dir,
            prompt_dir=self.prompt_dir,
            cache_dir=self.cache_dir,
            image_dir=self.image_dir,
            audio_dir=self.audio_dir,
            index_path=self.base_path / index_path,
            database_csv_path=self.base_path / database_csv_path,
            device=self.device,
        )

        # Media types the pipeline recognizes (lowercase suffixes).
        self.image_extension = {
            ".jpg",
            ".jpeg",
            ".png",
            ".bmp",
            ".tiff",
            ".tif",
            ".webp",
        }
        self.video_extension = {
            ".mp4",
            ".avi",
            ".mov",
            ".mkv",
        }

    def __load_checkpoint(self):
        """
        Load the G3 model checkpoint, move the model to the configured
        device, and switch it to eval mode.

        Raises:
            FileNotFoundError: if the checkpoint file is missing.
        """
        if not os.path.exists(self.checkpoint_path):
            raise FileNotFoundError(
                f"Checkpoint file not found: {self.checkpoint_path}"
            )
        self.model.load_state_dict(
            torch.load(self.checkpoint_path, map_location=self.device)
        )
        self.model.to(self.device)
        self.model.eval()
        logger.info(
            f"✅ Successfully loaded G3 model checkpoint from: {self.checkpoint_path}"
        )

    async def llm_predict(
        self,
        model_name: str = "gemini-2.5-pro",
        n_search: int | None = None,
        n_coords: int | None = None,
        image_prediction: bool = True,
        text_prediction: bool = True,
    ) -> LocationPrediction:
        """
        Generate a prediction using the Gemini LLM with Pydantic structured output.

        Args:
            model_name: LLM model name to use
            n_search: Number of search results to include
            n_coords: Number of coordinates to include
            image_prediction: Whether to use images in prediction
            text_prediction: Whether to use text in prediction

        Returns:
            LocationPrediction: validated prediction, or the zeroed fallback
            if all retries fail.
        """
        prompt = diversification_prompt(
            prompt_dir=str(self.prompt_dir),
            n_coords=n_coords,
            n_search=n_search,
            image_prediction=image_prediction,
            text_prediction=text_prediction,
        )

        images = []
        if image_prediction:
            image_dir = self.image_dir
            if not image_dir.exists():
                raise ValueError(f"Image directory does not exist: {image_dir}")

            # Attach every preprocessed .jpg as an inline image part.
            for image_file in image_dir.glob("*.jpg"):
                with open(image_file, "rb") as f:
                    image = types.Part.from_bytes(data=f.read(), mime_type="image/jpeg")
                images.append(image)

        client = genai.Client(api_key=os.environ["GOOGLE_CLOUD_API_KEY"])

        async def api_call():
            # Run the synchronous SDK call in the default executor so the
            # event loop is not blocked while waiting on the network.
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: client.models.generate_content(
                    model=model_name,
                    contents=[*images, prompt],
                    config=types.GenerateContentConfig(
                        tools=[
                            types.Tool(url_context=types.UrlContext()),
                        ],
                        temperature=0.1,
                        top_p=0.95,
                    ),
                ),
            )

            raw_text = response.text.strip() if response.text is not None else ""
            parsed_json = extract_and_parse_json(raw_text)

            try:
                validated = LocationPrediction.model_validate(parsed_json)
                return validated
            except (ValidationError, ValueError):
                # Re-raising lets the retry helper decide whether to retry.
                raise ValueError("Empty or invalid LLM response")

        return await handle_async_api_call_with_retry(
            api_call,
            fallback_result=LocationPrediction(
                latitude=0.0, longitude=0.0, location="", evidence=[]
            ),
            error_context=f"LLM prediction with {model_name}",
        )

    async def diversification_predict(
        self,
        model_name: str = "gemini-2.5-flash",
        image_prediction: bool = True,
        text_prediction: bool = True,
    ) -> LocationPrediction:
        """
        Diversification prediction without preprocessing (assumes preprocessing already done).
        Runs different sample sizes in parallel for faster execution.

        Args:
            model_name (str): LLM model name to use
            image_prediction (bool): Whether to use images in prediction
            text_prediction (bool): Whether to use text in prediction

        Returns:
            LocationPrediction: the candidate whose coordinates score highest
            under the G3 similarity model.
        """

        # Function to try a specific sample size with retry logic.
        # NOTE(review): a pydantic model instance is always truthy, so the
        # retry branch below appears unreachable — llm_predict already retries
        # internally and falls back to a zeroed prediction. Confirm intent.
        async def try_sample_size(num_sample):
            while True:
                prediction = await self.llm_predict(
                    model_name=model_name,
                    n_search=num_sample,
                    n_coords=num_sample,
                    image_prediction=image_prediction,
                    text_prediction=text_prediction,
                )

                if prediction:
                    coords = (prediction.latitude, prediction.longitude)
                    return (num_sample, coords, prediction)
                else:
                    logger.info(
                        f"Invalid or empty prediction format with {num_sample} samples, retrying..."
                    )

        # Run all sample sizes in parallel
        num_samples = [10, 15, 20]
        logger.info(
            f"🚀 Running {len(num_samples)} sample sizes in parallel: {num_samples}"
        )

        tasks = [try_sample_size(num_sample) for num_sample in num_samples]

        # Minimal file-like adapter so tqdm's progress output goes to the logger.
        class LW:
            def write(self, msg: str) -> int:
                logger.info(msg)
                return len(msg)

            def flush(self):
                pass

        results = await atqdm.gather(
            *tasks,
            desc="🔄 Running diversification predictions",
            file=LW(),
        )

        # Build predictions dictionary from parallel results.
        # Keyed by (lat, lon); identical coordinates from different sample
        # sizes collapse to a single entry (last one wins).
        predictions_dict = {}
        for num_sample, coords, prediction in results:
            predictions_dict[coords] = prediction
            logger.info(f"✅ Collected prediction with {num_sample} samples: {coords}")

        # Convert predictions to coordinate list for similarity scoring
        predicted_coords = list(predictions_dict.keys())
        logger.info(f"Predicted coordinates: {predicted_coords}")

        if not predicted_coords:
            raise ValueError("No valid predictions obtained from any sample size")

        # Calculate similarity scores between each candidate coordinate and
        # the query images, using the G3 model.
        avg_similarities = calculate_similarity_scores(
            model=self.model,
            device=self.device,
            predicted_coords=predicted_coords,
            image_dir=self.image_dir,
        )

        # Find best prediction
        best_idx = np.argmax(avg_similarities)
        best_coords = predicted_coords[best_idx]
        best_prediction = predictions_dict[best_coords]

        logger.info(f"🎯 Best prediction selected: {best_coords}")
        logger.info(f"   Similarity scores: {avg_similarities}")
        logger.info(f"   Best index: {best_idx}")

        return best_prediction

    async def location_predict(
        self,
        model_name: str = "gemini-2.5-flash",
        location: str = "specified location",
    ) -> GPSPrediction:
        """
        Generate a location-based prediction using the Gemini LLM with centralized retry logic.

        Tries a direct geocoding lookup first; only falls back to the LLM
        (with Google Search grounding) when geocoding fails.

        Args:
            model_name (str): LLM model name to use
            location (str): Location to use in the prompt

        Returns:
            GPSPrediction: coordinates plus supporting analysis/references.
        """
        if not location:
            raise ValueError("Location must be specified for location-based prediction")

        lat, lon = get_gps_from_location(location)
        if lat is not None and lon is not None:
            # Geocoder resolved the place name; skip the LLM entirely.
            logger.info(
                f"Using GPS coordinates for location '{location}': ({lat}, {lon})"
            )
            return GPSPrediction(
                latitude=lat, longitude=lon, analysis="", references=[]
            )
        else:
            prompt = location_prompt(location)
            client = genai.Client(api_key=os.environ["GOOGLE_CLOUD_API_KEY"])

            async def api_call():
                # Run the synchronous API call in a thread executor to make it truly async
                loop = asyncio.get_event_loop()
                response = await loop.run_in_executor(
                    None,
                    lambda: client.models.generate_content(
                        model=model_name,
                        contents=[prompt],
                        config=types.GenerateContentConfig(
                            tools=[
                                types.Tool(google_search=types.GoogleSearch()),
                            ],
                            temperature=0.1,
                            top_p=0.95,
                        ),
                    ),
                )

                raw_text = response.text.strip() if response.text is not None else ""
                parsed_json = extract_and_parse_json(raw_text)

                try:
                    validated = GPSPrediction.model_validate(parsed_json)
                    return validated
                except (ValidationError, ValueError):
                    raise ValueError("Empty or invalid LLM response")

            return await handle_async_api_call_with_retry(
                api_call,
                fallback_result=GPSPrediction(
                    latitude=0.0, longitude=0.0, analysis="", references=[]
                ),
                error_context=f"Location prediction for '{location}' with {model_name}",
            )

    async def verification_predict(
        self,
        prediction: LocationPrediction,
        model_name: str = "gemini-2.5-flash",
        image_prediction: bool = True,
        text_prediction: bool = True,
    ) -> LocationPrediction:
        """
        Generate verification prediction based on the provided prediction.

        Args:
            prediction (LocationPrediction): candidate to verify
            model_name (str): LLM model name to use for verification
            image_prediction (bool): Whether to use images in verification
            text_prediction (bool): Whether to use text in verification

        Returns:
            LocationPrediction: verified (possibly revised) prediction.
        """
        # Prepare verification data (now async); fetches e.g. a satellite
        # image for the candidate coordinates and returns its identifier.
        satellite_image_id = await self.data_processor.prepare_location_images(
            prediction=prediction.model_dump(),
            image_prediction=image_prediction,
            text_prediction=text_prediction,
        )

        image_dir = self.image_dir

        images = []
        if image_prediction:
            if not image_dir.exists():
                raise ValueError(f"Image directory does not exist: {image_dir}")

            for image_file in image_dir.glob("*.jpg"):
                with open(image_file, "rb") as f:
                    image = types.Part.from_bytes(data=f.read(), mime_type="image/jpeg")
                images.append(image)

        # Prepare verification prompt
        prompt = verification_prompt(
            satellite_image_id=satellite_image_id,
            prediction=prediction.model_dump(),
            prompt_dir=str(self.prompt_dir),
            image_prediction=image_prediction,
            text_prediction=text_prediction,
        )

        client = genai.Client(api_key=os.environ["GOOGLE_CLOUD_API_KEY"])

        async def api_call():
            # Run the synchronous API call in a thread executor to make it truly async
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: client.models.generate_content(
                    model=model_name,
                    contents=[*images, prompt],
                    config=types.GenerateContentConfig(
                        tools=[
                            types.Tool(url_context=types.UrlContext()),
                        ],
                        temperature=0.1,
                        top_p=0.95,
                    ),
                ),
            )

            raw_text = response.text.strip() if response.text is not None else ""
            parsed_json = extract_and_parse_json(raw_text)

            try:
                validated = LocationPrediction.model_validate(parsed_json)
                return validated
            except (ValidationError, ValueError):
                raise ValueError("Empty or invalid LLM response")

        return await handle_async_api_call_with_retry(
            api_call,
            fallback_result=LocationPrediction(
                latitude=0.0, longitude=0.0, location="", evidence=[]
            ),
            error_context=f"Verification prediction with {model_name}",
        )

    async def predict(
        self,
        model_name: str = "gemini-2.5-flash",
        image_prediction: bool = True,
        text_prediction: bool = True,
    ) -> LocationPrediction:
        """
        Complete prediction pipeline: preprocess, diversify, geolocate, verify.

        Args:
            model_name (str): LLM model name to use
            image_prediction (bool): Whether to use images in prediction
            text_prediction (bool): Whether to use text in prediction

        Returns:
            LocationPrediction: final verified prediction.
        """
        logger.info(
            f"🚀 Starting multi-modal prediction pipeline with model: {model_name}"
        )
        await self.data_processor.preprocess_input_data()
        # Step 1: Run diversification prediction (this is already parallel internally)
        logger.info(
            f"\n🔄 Running diversification prediction for Image={image_prediction}, Text={text_prediction}..."
        )
        diversification_result = await self.diversification_predict(
            model_name=model_name,
            image_prediction=image_prediction,
            text_prediction=text_prediction,
        )

        # Step 2: Run location prediction to refine the coordinates for the
        # predicted place name.
        location_prediction = await self.location_predict(
            model_name=model_name, location=diversification_result.location
        )

        logger.info("✅ Location prediction completed:")

        # Step 3: Update coordinates and evidence from location prediction.
        # model_copy keeps the diversification evidence intact.
        result = diversification_result.model_copy()
        result.longitude = location_prediction.longitude
        result.latitude = location_prediction.latitude

        # Step 4: Normalize and append location evidence
        if location_prediction.analysis and location_prediction.references:
            location_evidence = Evidence(
                analysis=location_prediction.analysis,
                references=location_prediction.references,
            )
        else:
            location_evidence = Evidence(
                analysis="No specific location analysis provided.",
                references=[],
            )

        # Append to result evidence
        result.evidence.append(location_evidence)

        # Step 5: Run verification prediction
        logger.info(
            f"\n🔄 Running verification prediction for Image={image_prediction}, Text={text_prediction}..."
        )
        result = await self.verification_predict(
            prediction=result,
            model_name=model_name,
            image_prediction=image_prediction,
            text_prediction=text_prediction,
        )

        logger.info(
            f"\n🎯 Final prediction for Image={image_prediction}, Text={text_prediction}:"
        )

        return result

    def get_response(self, prediction: LocationPrediction) -> LocationPrediction:
        """
        Convert image references in the prediction to base64 strings.

        Mutates the given prediction in place and also returns it.
        """
        for evidence in prediction.evidence:
            for i, ref in enumerate(evidence.references):
                if ref.startswith("image"):
                    evidence.references[i] = image_to_base64(self.image_dir / ref)
        return prediction

    def get_transcript(self) -> str:
        """
        Get the transcript from the transcript files in the audio directory.

        Concatenates every non-empty *.txt file, each preceded by a header
        naming its source file.
        """
        transcript = ""
        for transcript_file in self.audio_dir.glob("*.txt"):
            with open(transcript_file, "r", encoding="utf-8") as f:
                logger.info(f"Reading transcript from {transcript_file.name}")
                transcript_data = f.read().strip()
                if transcript_data:
                    transcript += f"Transcript for {transcript_file.name}\n"
                    transcript += transcript_data
        return transcript

    def clear_directories(self):
        """
        Clear the input and prompt directories.

        Removes the directory trees entirely; they are recreated on the
        next __init__.
        """
        delete_dirs = [self.input_dir, self.prompt_dir]
        for dir_path in delete_dirs:
            if os.path.exists(dir_path):
                shutil.rmtree(dir_path)
                logger.info(f"Deleted folder: {dir_path}")
            else:
                logger.info(f"Folder does not exist: {dir_path}")
src/prompt/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .factory import diversification_prompt as diversification_prompt
2
+ from .factory import location_prompt as location_prompt
3
+ from .factory import verification_prompt as verification_prompt
4
+ from .factory import Evidence as Evidence
5
+ from .factory import GPSPrediction as GPSPrediction
6
+ from .factory import LocationPrediction as LocationPrediction
src/prompt/factory.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from .template import DIVERSIFICATION_PROMPT, LOCATION_PROMPT, VERIFICATION_PROMPT
7
+
8
+
9
class Evidence(BaseModel):
    """One piece of geolocation evidence produced by the model."""

    # Free-text reasoning supporting the prediction.
    analysis: str
    # Supporting references: image filenames (e.g. "image_000.jpg") or URLs.
    # Image filenames are later swapped for base64 data before returning a
    # response. (Pydantic copies mutable defaults, so [] is safe here.)
    references: list[str] = []
12
+
13
+
14
class LocationPrediction(BaseModel):
    """Structured location prediction with its supporting evidence."""

    latitude: float
    longitude: float
    # Human-readable place description.
    location: str
    # Evidence items backing this prediction.
    evidence: list[Evidence]
19
+
20
+
21
class GPSPrediction(BaseModel):
    """Flat GPS prediction: coordinates plus a single analysis blob."""

    latitude: float
    longitude: float
    # Free-text reasoning for the coordinates.
    analysis: str
    # Filenames/URLs the analysis refers to.
    references: list[str]
26
+
27
+
28
+ def rag_prompt(index_search_json: str, n_coords: int | None = None) -> str:
29
+ """
30
+ Creates a formatted string with GPS coordinates for similar and dissimilar images.
31
+
32
+ Args:
33
+ candidates_gps (list[tuple]): List of (lat, lon) tuples for similar images.
34
+ reverse_gps (list[tuple]): List of (lat, lon) tuples for dissimilar images.
35
+ n_coords (int, optional): Number of coords to include from each list. Defaults to all.
36
+
37
+ Returns:
38
+ str: Formatted string with coordinates for reference.
39
+ """
40
+ if not os.path.exists(index_search_json):
41
+ return ""
42
+
43
+ with open(index_search_json, "r", encoding="utf-8") as file:
44
+ data = json.load(file)
45
+
46
+ candidates_gps = data.get("candidates_gps", [])
47
+ reverse_gps = data.get("reverse_gps", [])
48
+
49
+ if n_coords is not None:
50
+ candidates_gps = candidates_gps[: min(n_coords, len(candidates_gps))]
51
+ reverse_gps = reverse_gps[: min(n_coords, len(reverse_gps))]
52
+ else:
53
+ candidates_gps = candidates_gps
54
+ reverse_gps = reverse_gps
55
+
56
+ candidates_str = (
57
+ "[" + ", ".join(f"[{lat}, {lon}]" for (lat, lon) in candidates_gps) + "]"
58
+ )
59
+ reverse_str = "[" + ", ".join(f"[{lat}, {lon}]" for (lat, lon) in reverse_gps) + "]"
60
+ return f"For your reference, these are coordinates of some similar images: {candidates_str}, and these are coordinates of some dissimilar images: {reverse_str}."
61
+
62
+
63
def metadata_prompt(metadata_file_path: str) -> str:
    """
    Read a metadata JSON file and return a formatted string combining all fields.

    Args:
        metadata_file_path (str): Path to the metadata JSON file

    Returns:
        str: Formatted string with all metadata fields combined, or "" when
        the file is missing, empty, unreadable, or has no known fields.
    """
    if not metadata_file_path or not os.path.exists(metadata_file_path):
        return ""

    try:
        with open(metadata_file_path, "r", encoding="utf-8") as file:
            metadata = json.load(file)

        if not metadata:
            return ""

        # (json key, output label) pairs, in rendering order. Replaces six
        # copy-pasted if-blocks with one data-driven loop.
        fields = (
            ("location", "Location"),
            ("violence level", "Violence level"),
            ("title", "Title"),
            ("social media link", "Social media link"),
            ("description", "Description"),
            ("category", "Category"),
        )
        metadata_parts = [
            f"{label}: {metadata[key]}" for key, label in fields if metadata.get(key)
        ]

        if not metadata_parts:
            return ""

        return "Metadata for the image is: " + ". ".join(metadata_parts) + "."

    except Exception:
        # Best-effort: any IO/parse problem yields an empty prompt fragment.
        return ""
111
+
112
+
113
+ def search_prompt(search_candidates: list[str], n_search: int | None = None) -> str:
114
+ """
115
+ Formats search candidate links into a prompt string.
116
+
117
+ Args:
118
+ search_candidates (list[str]): List of candidate URLs from image search
119
+ n_search (int): Number of results to include (default: 5)
120
+
121
+ Returns:
122
+ str: Formatted string with candidate links, each on a new line
123
+
124
+ Example:
125
+ >>> candidates = search_prompt(["https://example1.com", "https://example2.com"], n_search=3)
126
+ >>> print(candidates)
127
+ Similar image can be found in those links:
128
+ https://example1.com
129
+ https://example2.com
130
+ """
131
+
132
+ if not search_candidates or not isinstance(search_candidates, list):
133
+ return ""
134
+
135
+ EXCLUDE_DOMAINS = [
136
+ "x.com",
137
+ "twitter.com",
138
+ "linkedin.com",
139
+ "bbc.com",
140
+ "bbc.co.uk",
141
+ "instagram.com",
142
+ "tiktok.com",
143
+ ]
144
+
145
+ for domain in EXCLUDE_DOMAINS:
146
+ search_candidates = [url for url in search_candidates if domain not in url]
147
+
148
+ if n_search is not None:
149
+ search_candidates = search_candidates[: min(n_search, len(search_candidates))]
150
+
151
+ try:
152
+ prompt = "\n".join(search_candidates)
153
+ return prompt
154
+
155
+ except Exception:
156
+ return ""
157
+
158
+
159
def image_search_prompt(image_search_json: str, n_search: int | None = None) -> str:
    """
    Read reverse-image-search results and combine matching-page links.

    Args:
        image_search_json (str): Path to a JSON file holding a list of result
            dicts with "pages_with_matching_images", "full_matching_images",
            and/or "partial_matching_images" keys.
        n_search (int, optional): Max number of links to include.

    Returns:
        str: Prompt listing pages with matching images, or "" when the file
        is missing or contains no matches.
    """
    # Mirror the other prompt builders: a missing file yields "" instead of
    # raising FileNotFoundError.
    if not os.path.exists(image_search_json):
        return ""

    pages_with_matching_images: set[str] = set()
    full_matching_images: set[str] = set()
    partial_matching_images: set[str] = set()

    with open(image_search_json, "r", encoding="utf-8") as file:
        data_list = json.load(file)
    for json_data in data_list:
        # Independent ifs: a single result dict may carry more than one of
        # these keys; the previous elif chain silently dropped later ones.
        if "pages_with_matching_images" in json_data:
            pages_with_matching_images.update(json_data["pages_with_matching_images"])
        if "full_matching_images" in json_data:
            full_matching_images.update(json_data["full_matching_images"])
        if "partial_matching_images" in json_data:
            partial_matching_images.update(json_data["partial_matching_images"])

    if (
        not pages_with_matching_images
        and not full_matching_images
        and not partial_matching_images
    ):
        return ""

    prompt = "Those are pages with matching images:\n"
    prompt += search_prompt(list(pages_with_matching_images), n_search=n_search)
    # NOTE(review): full/partial matching image URLs are collected but, as in
    # the original (commented-out) code, deliberately not emitted — confirm
    # before re-enabling them.
    return prompt
200
+
201
+
202
def search_content_prompt(search_content_json: str) -> str:
    """
    Read a JSON file of fetched search content and render it for a prompt.

    Args:
        search_content_json (str): Path to the JSON file with search content

    Returns:
        str: Pretty-printed JSON of the content list, or "" when the file is
        missing, empty, not a list, or unreadable.
    """
    if not os.path.exists(search_content_json):
        return ""

    try:
        with open(search_content_json, "r", encoding="utf-8") as fh:
            content = json.load(fh)

        if not isinstance(content, list) or not content:
            return ""

        return json.dumps(content, indent=2)

    except Exception:
        # Best-effort: unreadable content simply contributes nothing.
        return ""
227
+
228
+
229
def transcript_prompt(audio_dir: str) -> str:
    """
    Combine every transcript ``.txt`` file under *audio_dir* into one prompt.

    Args:
        audio_dir (str): Path to the audio directory containing transcript files

    Returns:
        str: "This is the transcript of the video: ..." or "" when the
        directory is missing or holds no transcript text.
    """
    if not os.path.exists(audio_dir):
        return ""

    snippets = []
    for name in os.listdir(audio_dir):
        if not name.endswith(".txt"):
            continue
        with open(os.path.join(audio_dir, name), "r", encoding="utf-8") as fh:
            snippets.append(fh.read().strip())

    combined = "\n".join(snippets)
    if not combined:
        return ""
    return f"This is the transcript of the video: {combined}"
256
+
257
+
258
def combine_prompt_data(
    prompt_dir: str,
    n_search: int | None = None,
    n_coords: int | None = None,
    image_prediction: bool = True,
    text_prediction: bool = True,
) -> str:
    """
    Combine every prompt data source under *prompt_dir* into one string.

    Sections (each skipped when empty): RAG coordinates from
    index_search.json, metadata.json, image_search_content.json,
    text_search_content.json, and transcripts under audio/.

    Args:
        prompt_dir (str): Directory holding the collected prompt data files.
        n_search (int, optional): Accepted for interface symmetry with the
            other prompt builders, but not forwarded to anything here —
            TODO confirm whether it should limit search content.
        n_coords (int, optional): Number of RAG coordinates to include;
            None disables the RAG section entirely.
        image_prediction (bool): Include image-search derived content.
        text_prediction (bool): Include metadata and text-search content.

    Returns:
        str: Combined prompt string (may be empty).
    """

    prompt_parts = []

    # 1. RAG prompt (optional)
    if n_coords is not None:
        rag_text = rag_prompt(os.path.join(prompt_dir, "index_search.json"), n_coords)
        prompt_parts.append(rag_text)

    # 2. Metadata prompt
    if text_prediction:
        metadata_text = metadata_prompt(os.path.join(prompt_dir, "metadata.json"))
        if metadata_text:
            prompt_parts.append(metadata_text)

    # 3. Search prompt
    if image_prediction:
        image_search_text = search_content_prompt(
            os.path.join(prompt_dir, "image_search_content.json")
        )
        if image_search_text:
            prompt_parts.append(image_search_text)

    if text_prediction:
        search_content_text = search_content_prompt(
            os.path.join(prompt_dir, "text_search_content.json")
        )
        if search_content_text:
            prompt_parts.append(search_content_text)

    # 4. Transcript prompt
    transcript_text = transcript_prompt(os.path.join(prompt_dir, "audio"))
    if transcript_text:
        prompt_parts.append(transcript_text)

    # Combine all parts with double newlines for readability
    combined_prompt = "\n\n".join(part for part in prompt_parts if part.strip())

    return combined_prompt
323
+
324
+
325
def diversification_prompt(
    prompt_dir: str,
    n_search: int | None = None,
    n_coords: int | None = None,
    image_prediction: bool = True,
    text_prediction: bool = True,
) -> str:
    """
    Build the diversification prompt from all data under *prompt_dir*.

    Args:
        prompt_dir (str): Directory with the collected prompt data files.
        n_search (int, optional): Forwarded to combine_prompt_data.
        n_coords (int, optional): Number of RAG coordinates to include;
            None disables the RAG section.
        image_prediction (bool): Include image-search derived content.
        text_prediction (bool): Include metadata and text-search content.

    Returns:
        str: The DIVERSIFICATION_PROMPT template filled with the combined
        prompt data.
    """

    prompt_data = combine_prompt_data(
        prompt_dir,
        n_search=n_search,
        n_coords=n_coords,
        image_prediction=image_prediction,
        text_prediction=text_prediction,
    )

    prompt = DIVERSIFICATION_PROMPT.strip().format(prompt_data=prompt_data)

    return prompt
364
+
365
+
366
def location_prompt(location: str) -> str:
    """
    Render the location prompt template for *location*.

    Args:
        location (str): The location to include in the prompt.

    Returns:
        str: Filled-in LOCATION_PROMPT, or "" for a falsy location.
    """
    if not location:
        return ""
    return LOCATION_PROMPT.strip().format(location=location)
383
+
384
+
385
def verification_prompt(
    satellite_image_id: int,
    prediction: dict,
    prompt_dir: str,
    n_search: int | None = None,
    n_coords: int | None = None,
    image_prediction: bool = True,
    text_prediction: bool = True,
) -> str:
    """
    Build the prompt asking the model to verify a location prediction.

    Args:
        satellite_image_id (int): Id of the satellite image fetched for the
            predicted coordinates; injected as a zero-padded 3-digit string.
        prediction (dict): The prediction to verify (must be JSON-serializable).
        prompt_dir (str): Directory with the collected prompt data files.
        n_search (int, optional): Forwarded to combine_prompt_data.
        n_coords (int, optional): Number of RAG coordinates to include.
        image_prediction (bool): Include image-search derived content.
        text_prediction (bool): Include metadata and text-search content.

    Returns:
        str: Formatted verification prompt string.
    """
    prompt_data = combine_prompt_data(
        prompt_dir,
        n_search=n_search,
        n_coords=n_coords,
        image_prediction=image_prediction,
        text_prediction=text_prediction,
    )

    prompt = VERIFICATION_PROMPT.strip().format(
        prompt_data=prompt_data,
        prediction=json.dumps(prediction, indent=2),
        satellite_image_id=f"{satellite_image_id:03d}",
    )

    return prompt
src/prompt/fetch/content_fetch.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Literal, TypedDict
6
+
7
+ from playwright.async_api import Page, async_playwright
8
+
9
+ READABILITY_JS_URL = "https://unpkg.com/@mozilla/readability@0.4.4/Readability.js"
10
+ logger = logging.getLogger("uvicorn.error")
11
+
12
+
13
class PageText(TypedDict):
    """A fetched page: its URL plus the readable text extracted from it."""

    url: str
    text: str
16
+
17
+
18
+ WaitUntil = Literal["load", "domcontentloaded", "networkidle", "commit"]
19
+
20
+
21
async def _inject_readability(page: Page) -> None:
    """
    Load Readability.js into *page* and pre-parse a cloned DOM.

    The parsed article ends up on ``window.__readability__``. Non-HTML root
    documents are skipped since Readability cannot process them.
    """
    root_is_html = await page.evaluate(
        "() => document.documentElement.nodeName === 'HTML'"
    )
    if not root_is_html:
        return

    await page.add_script_tag(url=READABILITY_JS_URL)
    await page.add_script_tag(
        content="window.__readability__ = new Readability(document.cloneNode(true));"
    )
30
+
31
+
32
async def _fetch_text(page: Page, url: str, wait_until: WaitUntil) -> str:
    """
    Navigate *page* to *url* and extract its readable text.

    Extraction strategy, in order:
      1. Readability.js article text,
      2. tweet text nodes (Twitter/X pages),
      3. the full <body> inner text.

    Args:
        page: Playwright page to drive.
        url: Address to load.
        wait_until: Playwright load-state to wait for before extracting.

    Returns:
        str: Extracted text (possibly empty).
    """
    await page.goto(url, wait_until=wait_until)
    await page.wait_for_timeout(1000)  # give late-running JS a moment

    # Attempt Readability.js parsing first.
    # Catch Exception, not BaseException: swallowing BaseException would also
    # swallow asyncio.CancelledError and break cancellation of the fetch task.
    try:
        await _inject_readability(page)
        readability_text = await page.evaluate(
            "() => window.__readability__.parse()?.textContent"
        )
        if readability_text:
            return readability_text.strip()
    except Exception:
        pass

    # Fallback: Twitter specific logic.
    try:
        tweet_text = await page.locator(
            "article div[data-testid='tweetText']"
        ).all_inner_texts()
        if tweet_text:
            return "\n".join(tweet_text)
    except Exception:
        pass

    # Final fallback: full body text.
    return await page.evaluate("() => document.body.innerText")
59
+
60
+
61
async def fetch_text(
    url: str, headless: bool = False, wait_until: WaitUntil = "load"
) -> PageText:
    """
    Fetch the readable text of a single URL in a fresh Chrome context.

    Args:
        url: Address to fetch.
        headless: Run the browser headless.
        wait_until: Playwright load-state to wait for.

    Returns:
        PageText: The URL paired with its extracted text.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch_persistent_context(
            user_data_dir="",
            channel="chrome",
            headless=headless,
            no_viewport=True,
        )
        try:
            page = await browser.new_page()
            text = await _fetch_text(page, url, wait_until)
        finally:
            # Always release the browser, even when navigation/extraction
            # raises — previously a failure leaked the whole context.
            await browser.close()

    return PageText(url=url, text=text)
76
+
77
+
78
async def fetch_texts(
    urls: list[str], headless: bool = False, wait_until: WaitUntil = "load"
) -> list[PageText | BaseException]:
    """
    Fetch readable text for many URLs concurrently in one browser context.

    Args:
        urls: Addresses to fetch.
        headless: Accepted for interface symmetry with fetch_text; see the
            NOTE below — the context is always launched headless here.
        wait_until: Playwright load-state to wait for.

    Returns:
        list: For each URL (same order), a PageText on success or the
        exception raised while fetching it.
    """
    async with async_playwright() as pw:
        # NOTE(review): headless is forced to True regardless of the
        # parameter — presumably required in the container environment;
        # confirm before honoring the argument.
        browser = await pw.chromium.launch_persistent_context(
            user_data_dir="",
            channel="chrome",
            headless=True,
            no_viewport=True,
        )
        try:
            pages = [await browser.new_page() for _ in urls]
            tasks = [
                _fetch_text(page, url, wait_until) for page, url in zip(pages, urls)
            ]
            results_raw = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            # Release the browser even if page creation fails part-way —
            # previously an early error leaked the context.
            await browser.close()

    results: list[PageText | BaseException] = []
    for url, result in zip(urls, results_raw):
        if isinstance(result, BaseException):
            results.append(result)
        else:
            results.append(PageText(url=url, text=result))

    return results
107
+
108
+
109
async def fetch_links_to_json(
    links: list[str],
    output_path: str,
    headless: bool = False,
    wait_until: WaitUntil = "load",
    max_content_length: int = 5000,
) -> None:
    """
    Fetch content from a list of links and save it to a JSON file.

    Args:
        links: List of URLs to fetch content from
        output_path: Path where the JSON file will be saved
        headless: Whether to run browser in headless mode
        wait_until: When to consider page loading complete
        max_content_length: Maximum number of characters to keep from each page content

    Returns:
        None (saves results to JSON file)
    """
    logger.info(f"📥 Fetching content from {len(links)} links...")

    # Fetch content from all links
    results = await fetch_texts(links, headless=headless, wait_until=wait_until)

    # Process results into the desired format, counting failures as we go.
    # (The old summary matched the prefix "Error fetching", which never
    # matched the actual failure sentinel, so it always reported 0 failures.)
    json_data = []
    failed = 0
    for i, (link, result) in enumerate(zip(links, results)):
        logger.info(f"  Processing {i + 1}/{len(links)}: {link}")

        if isinstance(result, BaseException):
            # Handle errors gracefully
            json_data.append({"link": link, "content": "Fail to fetch content..."})
            failed += 1
        else:
            # Successfully fetched content - apply length limit
            content = result["text"]
            if len(content) > max_content_length:
                content = (
                    content[:max_content_length]
                    + "... [content truncated due to length limit]"
                )
                logger.info(
                    f"✂️ Content truncated from {len(result['text'])} to {max_content_length} characters"
                )

            json_data.append({"link": link, "content": content})

    # Ensure output directory exists
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Save to JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

    logger.info(f"💾 Saved content from {len(links)} links to {output_path}")

    successful = len(json_data) - failed
    logger.info(f"📊 Summary: {successful} successful, {failed} failed")
src/prompt/fetch/satellite_fetch.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import httpx
4
+ from geopy import Point
5
+ from geopy.distance import distance
6
+
7
+ logger = logging.getLogger("uvicorn.error")
8
+
9
+
10
def meter_offsets(lat: float, lon: float, extend: float) -> tuple[float, float]:
    """
    Convert a radial distance in meters into latitude/longitude degree
    offsets around the center point (lat, lon).

    Returns:
        tuple[float, float]: (lat_offset, lon_offset) in decimal degrees.
    """
    center = Point(lat, lon)
    # Bearing 0° walks due north, 90° due east.
    northward = distance(meters=extend).destination(center, bearing=0)
    eastward = distance(meters=extend).destination(center, bearing=90)
    return northward.latitude - lat, eastward.longitude - lon
20
+
21
+
22
def fetch_satellite_image(
    lat: float, lon: float, extend: float, output_path: str = "esri_sat.png"
) -> None:
    """
    Fetch a satellite PNG around (lat, lon) from Esri's World Imagery service.

    Parameters:
    - lat: Latitude of the center point (decimal degrees).
    - lon: Longitude of the center point (decimal degrees).
    - extend: Buffer distance from center in meters (radius).
    - output_path: File path to save the resulting PNG.

    Tries image sizes 1024, 512, 256, 128 px in order and repeats the whole
    sequence up to 3 times before giving up.
    """
    # Degree offsets for the requested radius, via geopy.
    lat_offset, lon_offset = meter_offsets(lat, lon, extend)

    # Bounding box in lon/lat (EPSG:4326) order expected by the export API.
    minx, maxx = lon - lon_offset, lon + lon_offset
    miny, maxy = lat - lat_offset, lat + lat_offset

    base_url = (
        "https://server.arcgisonline.com/ArcGIS/rest/services/"
        "World_Imagery/MapServer/export"
    )

    for attempt in range(3):
        logger.info(f"Attempt {attempt + 1}/3 to fetch satellite image...")

        # Same halving sequence as before (1024 down to 128), as a for-loop.
        for size in (1024, 512, 256, 128):
            params = {
                "bbox": f"{minx},{miny},{maxx},{maxy}",
                "bboxSR": "4326",
                "size": f"{size},{size}",
                "format": "png",
                "f": "image",
            }
            try:
                response = httpx.get(base_url, params=params, timeout=30.0)
                if response.status_code == 200:
                    with open(output_path, "wb") as f:
                        f.write(response.content)
                    logger.info(f"Saved Esri image to {output_path} ({size}x{size})")
                    return
                logger.info(
                    f"Failed at size {size} (status {response.status_code}), trying {size // 2}"
                )
            except Exception as e:
                logger.error(f"Network error at size {size}: {e}, trying {size // 2}")

        if attempt < 2:  # skip the message on the final attempt
            logger.info(f"Attempt {attempt + 1} failed for all sizes, retrying...")

    logger.warning("Unable to fetch Esri imagery: all retry attempts failed.")
src/prompt/preprocess/keyframe_extract.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ import cv2
5
+ import numpy as np
6
+ from google.cloud import videointelligence_v1 as vi
7
+ from scipy.spatial.distance import cdist
8
+ from sklearn.metrics import silhouette_score
9
+
10
+ # Set up logger
11
+ logger = logging.getLogger("uvicorn.error")
12
+
13
+
14
def detect_shot_intervals_local(video_path: str) -> list[tuple[float, float]]:
    """
    Run Google Video Intelligence shot-change detection on a local video.

    Args:
        video_path (str): Path to the video file (uploaded inline).

    Returns:
        list[tuple[float, float]]: (start, end) of each shot in seconds,
        or [] when the API returns no annotation results.
    """
    logger.info(f"Detecting shot intervals for video: {video_path}")
    client = vi.VideoIntelligenceServiceClient()
    with open(video_path, "rb") as f:
        input_content = f.read()

    operation = client.annotate_video(
        request={
            "input_content": input_content,
            "features": [vi.Feature.SHOT_CHANGE_DETECTION],
        }
    )
    response = operation.result(timeout=300)
    if not response or not response.annotation_results:
        logger.error("No annotation_results found in video intelligence response.")
        return []

    intervals = []
    for shot in response.annotation_results[0].shot_annotations:
        start_offset = shot.start_time_offset
        end_offset = shot.end_time_offset
        intervals.append(
            (
                start_offset.seconds + start_offset.microseconds / 1e6,
                end_offset.seconds + end_offset.microseconds / 1e6,
            )
        )
    logger.info(f"Detected {len(intervals)} shot intervals.")
    return intervals
40
+
41
+
42
def color_histogram(img: np.ndarray) -> np.ndarray:
    """Return a flattened, normalized 8x8x8 HSV color histogram of *img* (BGR)."""
    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist(
        [hsv_img], [0, 1, 2], None, [8, 8, 8], [0, 180, 0, 256, 0, 256]
    )
    normalized = cv2.normalize(hist, hist)
    return normalized.flatten()
46
+
47
+
48
def sample_frames_per_shot(
    video_path: str, start: float, end: float, step: float = 1.0
) -> list[np.ndarray]:
    """
    Grab frames from *video_path* between *start* and *end* seconds, one
    every *step* seconds.

    Returns:
        list[np.ndarray]: Decoded BGR frames; sampling stops early on the
        first read failure.
    """
    capture = cv2.VideoCapture(video_path)
    sampled: list[np.ndarray] = []
    timestamp = start
    while timestamp < end:
        capture.set(cv2.CAP_PROP_POS_MSEC, timestamp * 1000)
        ok, frame = capture.read()
        if not ok:
            logger.warning(f"Failed to read frame at {timestamp:.2f}s")
            break
        sampled.append(frame)
        timestamp += step
    capture.release()
    return sampled
66
+
67
+
68
def kmeans_init(features: np.ndarray):
    """
    Randomly pick k = floor(sqrt(n)) rows (at least 1) as initial cluster
    centers and assign every row to its nearest center.

    Returns:
        tuple: (per-row cluster assignment, chosen center rows)
    """
    n_samples = features.shape[0]
    k = int(np.sqrt(n_samples)) or 1
    chosen = np.random.choice(n_samples, k, replace=False)
    centers = features[chosen]
    assignments = np.argmin(cdist(features, centers), axis=1)
    return assignments, centers
75
+
76
+
77
def kmeans_silhouette(features: np.ndarray):
    """
    Agglomerative refinement of a random k-means init, scored by silhouette.

    Starts from k = floor(sqrt(n)) random centers (via kmeans_init), then
    repeatedly merges the two closest centers down to k = 3, keeping the
    clustering with the best silhouette score seen along the way.

    Returns:
        tuple: (best_k, best_centers, center_indices) where center_indices
        are the row indices in *features* matching each kept center.
    """
    k = max(int(np.sqrt(len(features))), 2)
    best_k, best_score = k, -1
    clusters, centers = kmeans_init(features)
    best_centers = centers.copy()
    while k > 2:
        # Find the two closest centers (diagonal masked out) and merge them.
        d = cdist(centers, centers)
        np.fill_diagonal(d, np.inf)
        i, j = np.unravel_index(np.argmin(d), d.shape)
        clusters = np.where(clusters == j, i, clusters)
        # Shift labels above j down so labels stay contiguous after the merge.
        clusters = np.where(clusters > j, clusters - 1, clusters)
        new_centers = []
        for cid in range(k - 1):
            cluster_feats = features[clusters == cid]
            if cluster_feats.size == 0:
                continue
            # Medoid-style center: the member closest to the cluster mean.
            mean_vec = np.mean(cluster_feats, axis=0)
            idx_close = np.argmin(np.linalg.norm(cluster_feats - mean_vec, axis=1))
            new_centers.append(cluster_feats[idx_close])
        # NOTE(review): from here on `centers` is a plain list of rows, not an
        # ndarray; cdist accepts that as long as no cluster emptied — confirm.
        centers = new_centers
        k -= 1
        if len(np.unique(clusters)) > 1:
            score = silhouette_score(features, clusters)
            if score > best_score:
                best_score, best_k = score, k
                best_centers = centers.copy()
    # Map each kept center row back to its first exact match in features.
    center_indices = []
    for c in best_centers:
        matches = np.where((features == c).all(axis=1))[0]
        if matches.size > 0:
            center_indices.append(int(matches[0]))
    # logger.info(f"KMeans silhouette: best_k={best_k}, best_score={best_score:.4f}")
    return best_k, best_centers, center_indices
110
+
111
+
112
def redundancy_filter(
    video_path: str, indices: list[int], threshold: float
) -> list[int]:
    """
    Drop frames whose HSV histogram is too similar (cosine similarity above
    *threshold*) to any earlier candidate frame.

    Args:
        video_path (str): Video to read frames from.
        indices (list[int]): Candidate global frame numbers.
        threshold (float): Cosine-similarity cutoff.

    Returns:
        list[int]: Subset of *indices* judged non-redundant.
    """
    capture = cv2.VideoCapture(video_path)
    histograms = []
    for frame_no in indices:
        capture.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        ok, frame = capture.read()
        if ok:
            histograms.append(color_histogram(frame))
    capture.release()

    kept: list[int] = []
    for pos, hist in enumerate(histograms):
        redundant = any(
            np.dot(hist, prev) / (np.linalg.norm(hist) * np.linalg.norm(prev))
            > threshold
            for prev in histograms[:pos]
        )
        if not redundant:
            kept.append(indices[pos])
    return kept
133
+
134
+
135
def extract_and_save_keyframes(
    video_path: str,
    output_dir: str,
    start_index: int = 0,
    step: float = 1.0,
    threshold: float = 0.7,
    k_min: int = 2,
    k_max: int = 8,
) -> int:
    """
    Extract representative keyframes per detected shot and save them as JPEGs.

    Args:
        video_path (str): Input video.
        output_dir (str): Directory receiving image_NNN.jpg files.
        start_index (int): First output index to use.
        step (float): Sampling interval inside each shot, in seconds.
        threshold (float): Cosine-similarity cutoff for redundancy removal.
        k_min (int): Minimum feature count before clustering is attempted.
        k_max (int): Currently unused — kept for interface compatibility.

    Returns:
        int: The next free output index (start_index + frames saved).
    """
    logger.info(f"Starting keyframe extraction for {video_path}")
    os.makedirs(output_dir, exist_ok=True)

    # FPS is needed to convert second offsets into frame numbers.
    cap_meta = cv2.VideoCapture(video_path)
    video_fps = cap_meta.get(cv2.CAP_PROP_FPS) or 1.0
    cap_meta.release()

    intervals = detect_shot_intervals_local(video_path)
    cap = cv2.VideoCapture(video_path)
    output_idx = start_index

    for shot_idx, (start, end) in enumerate(intervals):
        # Sample frames & extract features
        frames = sample_frames_per_shot(video_path, start, end, step)
        feats = (
            np.vstack([color_histogram(f) for f in frames])
            if frames
            else np.empty((0,))
        )

        # Determine intra-shot keyframe indices
        if feats.size < k_min or feats.ndim == 1:
            idxs = list(range(len(frames)))
        else:
            _, _, idxs = kmeans_silhouette(feats)

        # Map sample index -> global frame number. Sample i was taken at time
        # (start + i * step), so its frame number is that time * fps.
        # (The old mapping added i directly — i.e. treated one sample as one
        # frame — mislocating keyframes whenever fps != 1/step.)
        global_idxs = [int((start + i * step) * video_fps) for i in idxs]
        filtered = redundancy_filter(video_path, global_idxs, threshold)

        # Save each keyframe sequentially into output_dir
        for frame_no in filtered:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
            ret, frame = cap.read()
            if not ret:
                continue
            out_path = os.path.join(output_dir, f"image_{output_idx:03d}.jpg")
            cv2.imwrite(out_path, frame)
            output_idx += 1
        logger.info(
            f"Shot {shot_idx + 1}: saved {len(filtered)} keyframes. Total so far: {output_idx}"
        )

    cap.release()
    logger.info(f"Extraction complete. Total frames saved: {output_idx}")
    return output_idx
src/prompt/preprocess/video_transcribe.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ import torchvision.transforms as T
12
+ import whisper
13
+ from moviepy import *
14
+ from open_clip import create_model_and_transforms
15
+
16
+ logger = logging.getLogger("uvicorn.error")
17
+
18
+
19
def extract_audio(video_path: str, output_dir: str) -> str:
    """
    Extract the audio track of a video and save it as a WAV file.

    Args:
        video_path (str): Path to input video file.
        output_dir (str): Directory to save the audio file.

    Returns:
        str: Path to the saved audio file, or "" on any failure (no audio
        track, extraction error, ...).
    """
    video_name = Path(video_path).stem
    audio_path = Path(output_dir) / f"{video_name}.wav"

    try:
        video = VideoFileClip(str(video_path))
        try:
            audio = video.audio
            if audio is not None:
                try:
                    audio.write_audiofile(str(audio_path), logger="bar")
                finally:
                    # Close the audio clip even when writing raises.
                    audio.close()
        finally:
            # Close the video clip in all paths so ffmpeg subprocesses and
            # file handles are not leaked on failure.
            video.close()

        if not audio_path.exists():
            raise RuntimeError("Audio file was not created")

        return str(audio_path)

    except Exception as e:
        logger.error(f"Error extracting audio: {str(e)}")
        return ""
+ return ""
50
+
51
+
52
def transcribe_audio(audio_path: str, model_name: str = "base") -> str:
    """
    Transcribe an audio file with Whisper.

    Args:
        audio_path (str): Path to the audio file.
        model_name (str): Whisper model name.

    Returns:
        str: Stripped transcription text.

    Raises:
        RuntimeError: When model loading or transcription fails.
    """
    try:
        whisper_model = whisper.load_model(model_name)
        output = whisper_model.transcribe(str(audio_path), fp16=False, verbose=False)
        return str(output.get("text", "")).strip()
    except Exception as e:
        raise RuntimeError(f"Error transcribing audio: {str(e)}")
+ raise RuntimeError(f"Error transcribing audio: {str(e)}")
70
+
71
+
72
def transcribe_video(
    video_path: str,
    output_dir: str = "g3/data/prompt_data/audio",
    model_name: str = "base",
):
    """
    Transcribe a video by extracting its audio and running Whisper on it.

    Args:
        video_path (str): Path to the video file.
        output_dir (str): Directory to save the audio and transcript files.
        model_name (str): Whisper model name.

    Returns:
        str | None: Path to the saved transcript text file, or None when
        audio extraction fails.
    """
    audio_path = extract_audio(video_path, output_dir)
    if not audio_path:
        logger.error("Audio extraction failed. No audio file created.")
        return None

    logger.info(f"Audio extracted to: {audio_path}")
    transcript_text = transcribe_audio(audio_path, model_name=model_name)

    transcript_path = Path(output_dir) / f"{Path(video_path).stem}_transcript.txt"
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(transcript_text)
    logger.info(f"Transcript saved to: {transcript_path}")
    # Return the path as documented (previously the function returned None
    # despite the docstring promising the transcript path).
    return str(transcript_path)
+ logger.info(f"Transcript saved to: {transcript_path}")
100
+
101
+
102
def transcribe_video_directory(
    video_dir: str,
    output_dir: str = "g3/data/prompt_data/audio",
    model_name: str = "base",
):
    """
    Transcribe every supported video file found directly inside *video_dir*.

    Args:
        video_dir (str): Directory containing video files.
        output_dir (str): Directory to save the audio and transcript files.
        model_name (str): Whisper model name.

    Returns:
        None
    """
    supported = {".mp4", ".avi", ".mov", ".mkv"}
    os.makedirs(output_dir, exist_ok=True)

    videos = [
        path
        for path in Path(video_dir).glob("*")
        if path.is_file() and path.suffix.lower() in supported
    ]

    if not videos:
        logger.info(f"No video files found in directory: {video_dir}")

    for video_file in videos:
        logger.info(f"Processing video: {video_file}")
        transcribe_video(str(video_file), output_dir, model_name=model_name)
src/prompt/search/image_search.py ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import logging
4
+ import os
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from pathlib import Path
8
+ from threading import Lock
9
+
10
+ import requests
11
+ from google.cloud import vision
12
+ from requests.adapters import HTTPAdapter
13
+ from urllib3.util.retry import Retry
14
+
15
+ logger = logging.getLogger("uvicorn.error")
16
+
17
+ # GOOGLE CLOUD VISION API
18
+
19
+
20
def annotate(path: str) -> vision.WebDetection:
    """Returns web annotations given the path to an image.

    Args:
        path: path to the input image. May be a local file path, an
            http(s) URL, or a Google Cloud Storage ("gs:") URI.

    Returns:
        An WebDetection object with relevant information of the
        image from the internet (i.e., the annotations).
    """
    client = vision.ImageAnnotatorClient()

    # Remote images (http/https/gs) are referenced by URI and fetched
    # server-side by the Vision API.
    if path.startswith("http") or path.startswith("gs:"):
        image = vision.Image()
        image.source.image_uri = path

    else:
        # Local images are read and sent inline as raw bytes.
        with open(path, "rb") as image_file:
            content = image_file.read()

        image = vision.Image(content=content)

    # Single-image annotate call requesting only the WEB_DETECTION feature.
    response = client.annotate_image(
        {
            "image": image,
            "features": [{"type_": vision.Feature.Type.WEB_DETECTION}],
        }
    )
    return response.web_detection
49
+
50
+
51
def annotate_directory(directory: str) -> list[vision.WebDetection]:
    """
    Perform web detection on all image files in the given directory in batches of 16.

    Args:
        directory (str): Path to the directory containing image files.

    Returns:
        list[vision.WebDetection]: List of WebDetection objects for each image
        that was successfully read and annotated. Unreadable images and failed
        batches are skipped, so the list may be shorter than the input set.
    """
    client = vision.ImageAnnotatorClient()

    # Collect all image files first (non-recursive; filtered by extension).
    image_files = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        if os.path.isfile(file_path) and file_name.lower().endswith(
            (".jpg", ".jpeg", ".png", ".bmp", ".gif")
        ):
            image_files.append(file_path)

    all_web_detections = []
    batch_size = 16  # Google Vision API batch limit

    # Process images in batches of 16
    for i in range(0, len(image_files), batch_size):
        batch_files = image_files[i : i + batch_size]
        logger.info(
            f"Processing batch {i // batch_size + 1}/{(len(image_files) + batch_size - 1) // batch_size} ({len(batch_files)} images)..."
        )

        # Prepare batch requests; unreadable files become None placeholders
        # so positions within the batch stay aligned with batch_files.
        image_requests = []
        for file_path in batch_files:
            try:
                with open(file_path, "rb") as image_file:
                    content = image_file.read()
                image = vision.Image(content=content)
                image_requests.append(image)
            except Exception as e:
                logger.warning(f"⚠️ Failed to read image {file_path}: {e}")
                # Add a placeholder to maintain order
                image_requests.append(None)

        # Filter out None values and keep track of valid indices so API
        # responses can be mapped back to their original batch positions.
        valid_requests = []
        valid_indices = []
        for idx, request in enumerate(image_requests):
            if request is not None:
                valid_requests.append(request)
                valid_indices.append(idx)

        if not valid_requests:
            logger.warning(f"⚠️ No valid images in batch {i // batch_size + 1}")
            continue

        try:
            # Make batch API call (one request per readable image).
            responses = client.batch_annotate_images(
                requests=[
                    vision.AnnotateImageRequest(
                        image=image,
                        features=[
                            vision.Feature(type=vision.Feature.Type.WEB_DETECTION)
                        ],
                    )
                    for image in valid_requests
                ]
            ).responses

            # Process responses and maintain order: responses come back in
            # the order submitted, valid_indices maps them to batch slots.
            batch_detections: list[vision.WebDetection | None] = [None] * len(
                batch_files
            )
            for response_idx, global_idx in enumerate(valid_indices):
                if (
                    response_idx < len(responses)
                    and responses[response_idx].web_detection
                ):
                    batch_detections[global_idx] = responses[response_idx].web_detection

            # Add to results (filter out None values)
            all_web_detections.extend(
                [det for det in batch_detections if det is not None]
            )

        except Exception as e:
            # A failed batch is dropped entirely; later batches still run.
            logger.warning(f"⚠️ Batch {i // batch_size + 1} failed: {e}")
            continue

    logger.info(
        f"✅ Successfully processed {len(all_web_detections)} images out of {len(image_files)} total"
    )
    return all_web_detections
145
+
146
+
147
def parse_web_detection(annotations: vision.WebDetection) -> dict:
    """Returns detected features in the provided web annotations as a dict."""
    # Build each bucket separately so the returned dict always carries all
    # four keys, even when a category is empty.
    page_urls = []
    full_urls = []
    partial_urls = []
    entities = []

    for page in annotations.pages_with_matching_images or []:
        page_urls.append(page.url)
    for image in annotations.full_matching_images or []:
        full_urls.append(image.url)
    for image in annotations.partial_matching_images or []:
        partial_urls.append(image.url)
    for entity in annotations.web_entities or []:
        entities.append({"score": entity.score, "description": entity.description})

    return {
        "pages_with_matching_images": page_urls,
        "full_matching_images": full_urls,
        "partial_matching_images": partial_urls,
        "web_entities": entities,
    }
170
+
171
+
172
def get_image_links_vision(annotations: vision.WebDetection) -> list[str]:
    """Extracts image links from web detection annotations."""
    # Sources in decreasing order of confidence: pages hosting the image,
    # then full matches, then partial matches. The first non-empty source
    # wins, matching the original cascade of fallbacks.
    ranked_sources = (
        annotations.pages_with_matching_images,
        annotations.full_matching_images,
        annotations.partial_matching_images,
    )
    for source in ranked_sources:
        if source:
            return [item.url for item in source]
    return []
187
+
188
+
189
+ # SCRAPING DOG API
190
def upload_image_to_imgbb(image_path: str, api_key: str) -> str:
    """Upload image to imgbb with automatic retry on transient errors.

    Args:
        image_path (str): Path to the local image file to upload.
        api_key (str): imgbb API key.

    Returns:
        str: Public URL of the uploaded image.

    Raises:
        FileNotFoundError: If the image file does not exist.
        Exception: If reading fails, the upload fails after all retries,
            or imgbb reports an unsuccessful upload.
    """
    # Encode the image as base64, which is what the imgbb upload API expects.
    try:
        with open(image_path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")
    except FileNotFoundError:
        raise FileNotFoundError(f"Image file not found: {image_path}")
    except Exception as e:
        raise Exception(f"Error reading image file: {e}")

    payload = {"key": api_key, "image": image_data}
    imgbb_url = "https://api.imgbb.com/1/upload"

    # Configure session with retry logic: up to 5 attempts with exponential
    # backoff on rate limiting (429) and transient server errors (5xx).
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"],
        raise_on_status=False,  # let raise_for_status() below surface errors
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    try:
        resp = session.post(imgbb_url, data=payload, timeout=30)
        resp.raise_for_status()
        result = resp.json()
        # imgbb wraps the outcome in a "success" flag; the hosted URL lives
        # under data.url on success.
        if result.get("success"):
            return result["data"]["url"]
        else:
            raise Exception(
                f"imgbb upload failed: {result.get('error', 'Unknown error')}"
            )
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to upload after retries: {e}")
230
+
231
+
232
def search_with_scrapingdog_lens(
    image_path: str, imgbb_key: str, scrapingdog_key: str
) -> dict:
    """
    Uploads an image to imgbb, then queries ScrapingDog's Google Lens API with 3 retries.

    Args:
        image_path (str): Path to the local image file.
        imgbb_key (str): imgbb API key used to host the image publicly.
        scrapingdog_key (str): ScrapingDog API key.

    Returns:
        dict: Parsed Lens API JSON response, or {"lens_results": []} when the
        upload or every Lens attempt fails (callers never see an exception).
    """
    try:
        # Google Lens needs a publicly reachable URL, so host the image on
        # imgbb first.
        image_url = upload_image_to_imgbb(image_path, imgbb_key)
        logger.info(f"Image uploaded to ImgBB: {image_url}")

        lens_url = f"https://lens.google.com/uploadbyurl?url={image_url}"
        params = {
            "api_key": scrapingdog_key,
            "url": lens_url,
            "visual_matches": "true",
            "exact_matches": "true",
        }

        # Retry logic - 3 attempts
        for attempt in range(3):
            try:
                resp = requests.get(
                    "https://api.scrapingdog.com/google_lens", params=params, timeout=60
                )
                resp.raise_for_status()
                return resp.json()
            except requests.exceptions.RequestException as e:
                logger.warning(
                    f"⚠️ ScrapingDog attempt {attempt + 1}/3 failed for {os.path.basename(image_path)}: {e}"
                )
                if attempt < 2:  # Don't sleep on the last attempt
                    time.sleep(2)  # Wait 2 seconds before retrying
                continue

        # All retries failed
        logger.error(
            f"❌ All 3 ScrapingDog attempts failed for {os.path.basename(image_path)}"
        )
        return {"lens_results": []}

    except Exception as e:
        # Catch-all (e.g. imgbb upload failure) keeps the pipeline running.
        logger.warning(f"⚠️ ScrapingDog API unexpected error for {image_path}: {e}")
        return {"lens_results": []}
275
+
276
+
277
def get_image_links_scrapingdog(search_results: dict, n_results: int = 5) -> list[str]:
    """Get up to ``n_results`` image links from a ScrapingDog Lens response.

    Args:
        search_results (dict): Parsed JSON response from the Lens API.
        n_results (int): Maximum number of links to return.

    Returns:
        list[str]: The first ``n_results`` links found in ``lens_results``.
    """
    # Robustness fix: lens entries can lack a "link" key; skip those instead
    # of raising KeyError on the whole response.
    links: list[str] = []
    for result in search_results.get("lens_results", []):
        link = result.get("link")
        if link:
            links.append(link)
        if len(links) >= n_results:
            break
    return links
282
+
283
+
284
def process_scrapingdog_only(image_path: str) -> list[str]:
    """Process a single image with ScrapingDog API only.

    Reads the IMGBB_API_KEY and SCRAPINGDOG_API_KEY environment variables
    (KeyError from a missing variable is swallowed by the broad except and
    yields an empty result).

    Args:
        image_path (str): Path to the image file.

    Returns:
        list[str]: Up to 5 matching links, or [] on any failure.
    """
    try:
        scrapingdog_search_result = search_with_scrapingdog_lens(
            image_path=image_path,
            imgbb_key=os.environ["IMGBB_API_KEY"],
            scrapingdog_key=os.environ["SCRAPINGDOG_API_KEY"],
        )
        scrapingdog_result = get_image_links_scrapingdog(
            scrapingdog_search_result, n_results=5
        )

        # Serialize log output across the worker threads that call this.
        with print_lock:
            logger.info(
                f"✅ ScrapingDog completed for {os.path.basename(image_path)} - {len(scrapingdog_result)} links"
            )

        return scrapingdog_result
    except Exception as e:
        with print_lock:
            logger.error(
                f"❌ ScrapingDog error for {os.path.basename(image_path)}: {e}"
            )
        return []
308
+
309
+
310
+ # Thread-safe print lock
311
+ print_lock = Lock()
312
+
313
+
314
def process_single_image(image_path: str, imgbb_key: str, scrapingdog_key: str) -> dict:
    """
    Process a single image with both Vision API and ScrapingDog API.

    Args:
        image_path: Path to the image file
        imgbb_key: ImgBB API key
        scrapingdog_key: ScrapingDog API key

    Returns:
        Dictionary with keys "image_path", "vision_result" and
        "scrapingdog_result"; on failure both result lists are empty and an
        "error" key carries the exception message.
    """
    try:
        # Vision API processing: reverse-image web detection + link extraction.
        annotations = annotate(image_path)
        vision_result = get_image_links_vision(annotations)

        # ScrapingDog API processing: Google Lens search via public upload.
        scrapingdog_search_result = search_with_scrapingdog_lens(
            image_path=image_path, imgbb_key=imgbb_key, scrapingdog_key=scrapingdog_key
        )
        scrapingdog_result = get_image_links_scrapingdog(
            scrapingdog_search_result, n_results=5
        )

        result = {
            "image_path": os.path.basename(image_path),
            "vision_result": vision_result,
            "scrapingdog_result": scrapingdog_result,
        }

        # Serialize log output across worker threads.
        with print_lock:
            logger.info(f"✅ Completed processing {os.path.basename(image_path)}")

        return result

    except Exception as e:
        with print_lock:
            logger.error(f"❌ Error processing {os.path.basename(image_path)}: {e}")
        return {
            "image_path": os.path.basename(image_path),
            "vision_result": [],
            "scrapingdog_result": [],
            "error": str(e),
        }
360
+
361
+
362
def image_search_directory(
    directory: str,
    output_dir: str = "g3/data/prompt_data",
    filename: str = "metadata.json",
    imgbb_key: str = "YOUR_IMGBB_API_KEY",
    scrapingdog_key: str = "YOUR_SCRAPINGDOG_API_KEY",
    max_workers: int = 4,
    target_links: int = 20,
) -> None:
    """
    Perform web detection with a two-phase approach:
    1. Run Vision API on all images first using annotate_directory
    2. If total unique links < target_links, run ScrapingDog on images until target is reached

    Args:
        directory (str): Path to the directory containing image files.
        output_dir (str): Directory to save the JSON output.
        filename (str): Name of the JSON file to save the results.
        imgbb_key (str): ImgBB API key for image uploading.
        scrapingdog_key (str): ScrapingDog API key for lens search.
        max_workers (int): Maximum number of parallel workers.
        target_links (int): Target number of unique links to collect.

    Returns:
        None
    """
    # Links pointing at these domains are dropped from both phases.
    EXCLUDE_DOMAIN = [
        "youtube.com",
    ]
    # Collect candidate image files (non-recursive, filtered by extension).
    image_files = []
    for file_name in os.listdir(directory):
        file_path = os.path.join(directory, file_name)
        if os.path.isfile(file_path) and file_name.lower().endswith(
            (".jpg", ".jpeg", ".png", ".bmp", ".gif")
        ):
            image_files.append(file_path)

    if not image_files:
        logger.info("No image files found in the directory.")
        return

    logger.info(
        f"Found {len(image_files)} image files. Target: {target_links} unique links"
    )

    # Phase 1: Run Vision API on all images using annotate_directory
    logger.info("🔍 Phase 1: Running Vision API on all images...")
    all_links = set()
    vision_links_count = 0

    try:
        # Use the existing annotate_directory function for batch processing
        web_detections = annotate_directory(directory)

        # Extract links from all web detections, excluding unwanted domains;
        # the set deduplicates automatically.
        for detection in web_detections:
            links = get_image_links_vision(detection)
            links = [
                link
                for link in links
                if not any(domain in link for domain in EXCLUDE_DOMAIN)
            ]
            all_links.update(links)

        vision_links_count = len(all_links)
        logger.info(
            f"✅ Phase 1 complete: {vision_links_count} unique links from Vision API"
        )

    except Exception as e:
        logger.error(f"❌ Vision API processing failed: {e}")
        all_links = set()
        vision_links_count = 0

    # Phase 2: Run ScrapingDog only if Phase 1 fell short of the target.
    scrapingdog_links_count = 0

    if len(all_links) < target_links:
        needed_links = target_links - len(all_links)
        logger.info(
            f"🔍 Phase 2: Need {needed_links} more links. Running ScrapingDog..."
        )

        # Skip Phase 2 when the keys are still the placeholder defaults.
        if (
            imgbb_key == "YOUR_IMGBB_API_KEY"
            or scrapingdog_key == "YOUR_SCRAPINGDOG_API_KEY"
        ):
            logger.warning("⚠️ ScrapingDog API keys not available. Skipping Phase 2.")
        else:
            scrapingdog_completed = 0

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit ScrapingDog tasks for all images
                future_to_image = {
                    executor.submit(process_scrapingdog_only, image_path): image_path
                    for image_path in image_files
                }

                # Collect ScrapingDog results until we have enough links
                for future in as_completed(future_to_image):
                    image_path = future_to_image[future]
                    try:
                        result_links = future.result()
                        filtered_links = [
                            link
                            for link in result_links
                            if not any(
                                domain in link for domain in EXCLUDE_DOMAIN
                            )
                        ]
                        # Bug fix: count NEW unique links by measuring the set
                        # size before and after the update. The previous code
                        # compared against len(filtered_links), which produced
                        # an arbitrary (even negative) per-image count.
                        links_before = len(all_links)
                        all_links.update(filtered_links)
                        scrapingdog_links_count += len(all_links) - links_before
                        scrapingdog_completed += 1

                        with print_lock:
                            logger.info(
                                f"ScrapingDog Progress: {scrapingdog_completed}/{len(image_files)} images, "
                                f"{scrapingdog_links_count} new ScrapingDog links, {len(all_links)} total unique"
                            )

                        # Stop early if we have enough links
                        if len(all_links) >= target_links:
                            logger.info(
                                f"🎯 Target reached! {len(all_links)} >= {target_links} links"
                            )
                            # Cancel remaining futures
                            for remaining_future in future_to_image:
                                if not remaining_future.done():
                                    remaining_future.cancel()
                            break

                    except Exception as e:
                        with print_lock:
                            logger.error(
                                f"❌ Failed ScrapingDog for {os.path.basename(image_path)}: {e}"
                            )
                        scrapingdog_completed += 1

    # Prepare final results; cap the saved list at target_links.
    total_unique_links = len(all_links)
    all_links = list(all_links)[:target_links]
    results = {
        "all_links": all_links,
        "total_unique_links": total_unique_links,
        "target_achieved": total_unique_links >= target_links,
        "summary": {
            "images_processed": len(image_files),
            "vision_links": vision_links_count,
            "scrapingdog_links": scrapingdog_links_count,
            "total_unique_links": total_unique_links,
            "target_links": target_links,
        },
    }

    # Ensure the output directory exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Save results to JSON file
    out_path = Path(output_dir) / filename
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    logger.info(
        f"✅ Saved results to {out_path}\n"
        f"📊 Summary: {vision_links_count} Vision + {scrapingdog_links_count} ScrapingDog = {total_unique_links} total unique links"
    )
src/prompt/search/index_search.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from pathlib import Path
6
+ from threading import Lock
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+ from PIL import Image
12
+
13
+ logger = logging.getLogger("uvicorn.error")
14
+
15
+ # Thread-safe lock for logging
16
+ print_lock = Lock()
17
+
18
+
19
def search_index(model, rgb_image, device, index, top_k=20):
    """
    Search FAISS index for similar and dissimilar coordinates using image embeddings.

    Args:
        model: Vision model used for embedding generation. Assumed to expose
            vision_processor, vision_model, vision_projection and the
            vision_projection_else_1/2 heads — TODO confirm against the G3
            model definition.
        rgb_image: PIL RGB Image.
        device: Device to run the model on (e.g., "cuda" or "cpu").
        index: FAISS index for searching.
        top_k (int): Number of top results to return.

    Returns:
        tuple: (D, I, D_reverse, I_reverse) - distances and indices for positive and negative embeddings.
    """
    # Preprocess to the model's expected pixel grid (224x224 per channel).
    image = model.vision_processor(images=rgb_image, return_tensors="pt")[
        "pixel_values"
    ].reshape(-1, 224, 224)
    image = image.unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        # Index 1 of the vision model output is used as the pooled feature
        # fed to the projection heads.
        vision_output = model.vision_model(image)[1]
        image_embeds = model.vision_projection(vision_output)
        # L2-normalize each embedding so FAISS inner-product/cosine search
        # behaves consistently across the three heads.
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)

        image_text_embeds = model.vision_projection_else_1(
            model.vision_projection(vision_output)
        )
        image_text_embeds = image_text_embeds / image_text_embeds.norm(
            p=2, dim=-1, keepdim=True
        )

        image_location_embeds = model.vision_projection_else_2(
            model.vision_projection(vision_output)
        )
        image_location_embeds = image_location_embeds / image_location_embeds.norm(
            p=2, dim=-1, keepdim=True
        )

        # Concatenate the three normalized views into one query vector; the
        # index is assumed to have been built with the same layout.
        positive_image_embeds = torch.cat(
            [image_embeds, image_text_embeds, image_location_embeds], dim=1
        )

    positive_image_embeds = (
        positive_image_embeds.cpu().detach().numpy().astype(np.float32)
    )

    # Negated query retrieves the most dissimilar entries.
    negative_image_embeds = positive_image_embeds * (-1.0)

    # Search FAISS index
    D, I = index.search(positive_image_embeds, top_k)
    D_reverse, I_reverse = index.search(negative_image_embeds, top_k)
    return D, I, D_reverse, I_reverse
71
+
72
+
73
def get_gps_coordinates(I, I_reverse, database_csv_path):
    """
    Helper method to get GPS coordinates from database using FAISS indices.

    The CSV is streamed in chunks so the full database never has to be in
    memory. Coordinates are returned in the same order as the FAISS indices
    (i.e. ranked by similarity) and the scan stops early once every
    requested index has been resolved.

    Args:
        I: FAISS indices for positive embeddings, shape (1, top_k).
        I_reverse: FAISS indices for negative embeddings, shape (1, top_k).
        database_csv_path (str): Path to GPS coordinates database CSV with
            "LAT" and "LON" columns; FAISS indices are row positions.

    Returns:
        tuple: (candidates_gps, reverse_gps) - lists of (lat, lon) tuples
    """
    if I is None or I_reverse is None:
        return [], []

    candidate_indices = list(I[0])
    reverse_indices = list(I_reverse[0])

    # Resolve each wanted row index to its (lat, lon) pair. Keying a dict by
    # row index lets us restore FAISS ranking order afterwards; the previous
    # implementation appended in chunk order, which scrambled the ranking
    # whenever hits spanned multiple CSV chunks.
    wanted = set(candidate_indices) | set(reverse_indices)
    coords = {}

    try:
        for chunk in pd.read_csv(
            database_csv_path, chunksize=10000, usecols=["LAT", "LON"]
        ):
            # pandas keeps the global row index across chunks, so membership
            # against chunk.index identifies which wanted rows live here.
            found = wanted.intersection(chunk.index)
            for idx in found:
                coords[idx] = (
                    float(chunk.loc[idx, "LAT"]),
                    float(chunk.loc[idx, "LON"]),
                )
            wanted -= found
            if not wanted:
                # Early exit: every requested index has been resolved.
                break
    except Exception as e:
        logger.error(f"⚠️ Error loading GPS coordinates from database: {e}")

    candidates_gps = [coords[idx] for idx in candidate_indices if idx in coords]
    reverse_gps = [coords[idx] for idx in reverse_indices if idx in coords]
    return candidates_gps, reverse_gps
113
+
114
+
115
def save_results_to_json(candidates_gps: list, reverse_gps: list, output_path: str):
    """
    Save index-search results to a JSON file.

    Args:
        candidates_gps (list): (lat, lon) pairs for similar candidates.
        reverse_gps (list): (lat, lon) pairs for dissimilar candidates.
        output_path (str): Path to the output JSON file.
    """
    payload = {"candidates_gps": candidates_gps, "reverse_gps": reverse_gps}
    with open(output_path, "w") as json_file:
        json.dump(payload, json_file, indent=4)
126
+
127
+
128
def process_single_image(image_path, model, device, index, database_csv_path, top_k=20):
    """
    Process a single image for index search.

    Args:
        image_path: Path to the image file
        model: Vision model used for embedding generation
        device: Device to run the model on
        index: FAISS index for searching
        database_csv_path: Path to GPS coordinates database CSV
        top_k: Number of top results to return

    Returns:
        tuple: (candidates_gps, reverse_gps) for this image; ([], []) on any
        failure (the exception is logged, not re-raised, so one bad image
        cannot abort a directory run).
    """
    try:
        # Decode and normalize to RGB; FAISS lookup then maps the returned
        # row indices to GPS coordinates via the database CSV.
        rgb_image = Image.open(image_path).convert("RGB")
        D, I, D_reverse, I_reverse = search_index(
            model, rgb_image, device, index, top_k
        )
        candidates_gps, reverse_gps = get_gps_coordinates(
            I, I_reverse, database_csv_path
        )

        return candidates_gps, reverse_gps
    except Exception as e:
        # Serialize log output across the worker threads that call this.
        with print_lock:
            logger.error(f"❌ Error processing {os.path.basename(image_path)}: {e}")
        return [], []
162
+
163
+
164
def search_index_directory(
    model,
    device,
    index,
    image_dir,
    database_csv_path,
    top_k=20,
    max_elements=20,
    max_workers=4,
):
    """
    Perform FAISS index search for all images in a directory in parallel and gradually build a prioritized set of candidates.

    Results are merged round-robin by rank: every image's rank-0 hit is taken
    before any image's rank-1 hit, until max_elements unique coordinates are
    collected for each of the two sets.

    Args:
        model: Vision model used for embedding generation.
        device: Device to run the model on (e.g., "cuda" or "cpu").
        index: FAISS index for searching.
        image_dir (str): Path to the directory containing images.
        database_csv_path (str): Path to GPS coordinates database CSV.
        top_k (int): Number of top results to return for each image.
        max_elements (int): Maximum number of elements in the final candidates set.
        max_workers (int): Maximum number of parallel workers.

    Returns:
        tuple: (candidates_gps, reverse_gps) - lists of (lat, lon) tuples.
    """
    # Get all image paths (non-recursive, filtered by extension).
    image_paths = [
        Path(image_dir) / img
        for img in os.listdir(image_dir)
        if img.lower().endswith((".jpg", ".jpeg", ".png", ".bmp"))
    ]

    if not image_paths:
        logger.warning("No images found in directory")
        return [], []

    logger.info(
        f"🚀 Processing {len(image_paths)} images with {max_workers} parallel workers..."
    )

    all_candidates_gps = []
    all_reverse_gps = []
    completed_count = 0

    # Process images in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_path = {
            executor.submit(
                process_single_image,
                image_path,
                model,
                device,
                index,
                database_csv_path,
                top_k,
            ): image_path
            for image_path in image_paths
        }

        # Collect results as they complete (per-image lists arrive in
        # completion order, which is fine: the rank interleave below does
        # not depend on image order).
        for future in as_completed(future_to_path):
            image_path = future_to_path[future]
            try:
                candidates_gps, reverse_gps = future.result()
                all_candidates_gps.append(candidates_gps)
                all_reverse_gps.append(reverse_gps)
                completed_count += 1

                with print_lock:
                    logger.info(
                        f"Progress: {completed_count}/{len(image_paths)} images completed"
                    )

            except Exception as e:
                with print_lock:
                    logger.error(
                        f"❌ Failed to process {os.path.basename(image_path)}: {e}"
                    )
                # Add empty results for failed images
                all_candidates_gps.append([])
                all_reverse_gps.append([])
                completed_count += 1

    # Build prioritized sets from all results: outer loop walks ranks,
    # inner loop walks images, so the best hit of every image is considered
    # before any second-best hit. Sets deduplicate repeated coordinates.
    candidates_gps = set()
    reverse_gps = set()

    for priority in range(top_k):
        for image_candidates_gps, image_reverse_gps in zip(
            all_candidates_gps, all_reverse_gps
        ):
            if len(candidates_gps) < max_elements and priority < len(
                image_candidates_gps
            ):
                candidates_gps.add(image_candidates_gps[priority])
            if len(reverse_gps) < max_elements and priority < len(image_reverse_gps):
                reverse_gps.add(image_reverse_gps[priority])

        if len(candidates_gps) >= max_elements and len(reverse_gps) >= max_elements:
            break

    logger.info(
        f"🎯 Final results: {len(candidates_gps)} candidates, {len(reverse_gps)} reverse GPS coordinates"
    )

    return list(candidates_gps), list(reverse_gps)
src/prompt/search/text_search.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import time
5
+ from typing import Optional
6
+
7
+ import httpx
8
+ from dotenv import load_dotenv
9
+
10
+ logger = logging.getLogger("uvicorn.error")
11
+
12
+
13
def retry_request(func, max_retries=3, base_delay=2.0):
    """
    Call ``func`` repeatedly, backing off exponentially on transient errors.

    Timeouts and retryable 5xx HTTP status errors are retried up to
    ``max_retries`` times with a delay of ``base_delay * 2**attempt``
    seconds; any other exception (or a non-retryable status) is logged and
    re-raised immediately.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Maximum number of attempts.
        base_delay: Base delay for exponential backoff.

    Returns:
        Whatever ``func`` returns on the first successful call.

    Raises:
        The last exception if all retries fail.
    """
    for attempt in range(max_retries):
        out_of_retries = attempt >= max_retries - 1
        try:
            return func()
        except (httpx.ReadTimeout, httpx.ConnectTimeout, httpx.TimeoutException) as e:
            if out_of_retries:
                logger.error(
                    f"❌ Max retries ({max_retries}) exceeded for timeout error."
                )
                raise e
            delay = base_delay * (2**attempt)
            logger.warning(
                f"⚠️ Timeout error (attempt {attempt + 1}/{max_retries}). Retrying in {delay}s..."
            )
            time.sleep(delay)
        except httpx.HTTPStatusError as e:
            if e.response.status_code in [500, 502, 503, 504] and not out_of_retries:
                delay = base_delay * (2**attempt)
                logger.warning(
                    f"⚠️ Server error {e.response.status_code} (attempt {attempt + 1}/{max_retries}). Retrying in {delay}s..."
                )
                time.sleep(delay)
                continue
            logger.error(f"❌ HTTP error {e.response.status_code}: {e}")
            raise e
        except Exception as e:
            logger.error(f"❌ Unexpected error: {e}")
            raise e

    # Should never reach here
    raise RuntimeError("Retry logic failed")
61
+
62
+
63
def extension_from_content_type(content_type: str) -> str:
    """Map an HTTP Content-Type header value to an image file extension.

    Args:
        content_type: Raw header value, possibly carrying parameters
            (e.g. "image/jpeg; charset=utf-8").

    Returns:
        The file extension (without the dot) for the image type.

    Raises:
        ValueError: If the content type is not a supported image type.
    """
    # Define allowed image types
    allowed_types = {
        "image/png": "png",
        "image/jpeg": "jpg",
        "image/jpg": "jpg",
        "image/webp": "webp",
        "image/heic": "heic",
        "image/heif": "heif",
    }

    # Normalize content type (remove charset, etc.)
    content_type = content_type.split(";")[0].strip().lower()

    ext = allowed_types.get(content_type)
    if ext is None:
        raise ValueError(
            f"Content type '{content_type}' is not supported. Allowed types: {list(allowed_types.keys())}"
        )
    return ext
83
+
84
+
85
def text_search_image(
    query: str,
    num_images: int = 5,
    api_key: str | None = None,
    cx: str | None = None,
    output_dir: str = "g3/data/prompt_data/images",
    start_index: int = 0,
) -> list[str]:
    """
    Search Google Custom Search for images matching ``query`` and download them.

    Args:
        query (str): Search query string.
        num_images (int): Number of images to download.
        api_key (str | None): Google API key; required.
        cx (str | None): Custom Search Engine ID; required.
        output_dir (str): Directory to write downloaded images into.
        start_index (int): First numeric suffix used for saved filenames
            (files are named ``image_NNN.ext``).

    Returns:
        list[str]: Paths of the files actually downloaded (may be fewer than
        ``num_images`` if the API or downloads fail).

    Raises:
        ValueError: If ``api_key`` or ``cx`` is missing.
    """
    if not api_key or not cx:
        raise ValueError("GOOGLE_CLOUD_API_KEY or GOOGLE_CSE_CX not set.")

    os.makedirs(output_dir, exist_ok=True)
    downloaded_files: list[str] = []
    # Custom Search pagination is 1-based; each page yields at most 10 items.
    start: int = 1

    idx = start_index
    while len(downloaded_files) < num_images:
        params = {
            "q": query,
            "searchType": "image",
            "cx": cx,
            "key": api_key,
            "num": min(10, num_images - len(downloaded_files)),
            "start": start,
        }

        # Use retry logic for the API request
        try:
            response = retry_request(
                lambda: httpx.get(
                    "https://customsearch.googleapis.com/customsearch/v1",
                    params=params,
                    timeout=30.0,  # Increased timeout
                )
            )
            response.raise_for_status()
        except Exception as e:
            logger.error(f"❌ Failed to search for images after retries: {e}")
            break

        results = response.json().get("items", [])

        if not results:
            logger.info("No more results from API")
            break

        for item in results:
            img_url: str | None = item.get("link")
            if not img_url:
                continue
            try:
                # Use retry logic for image download; bind the URL as a
                # default so the lambda does not capture the loop variable.
                r = retry_request(lambda url=img_url: httpx.get(url, timeout=15.0))
                r.raise_for_status()
                content_type = r.headers.get("Content-Type", "")

                # Check if content type is supported before processing
                try:
                    ext = extension_from_content_type(content_type)
                except ValueError as e:
                    logger.info(f"Skipping {img_url}: {e}")
                    continue

                filename = os.path.join(output_dir, f"image_{idx:03d}.{ext}")
                with open(filename, "wb") as f:
                    f.write(r.content)
                downloaded_files.append(filename)
                idx += 1
                if len(downloaded_files) >= num_images:
                    break
            except httpx.HTTPError as e:
                # A failed download is skipped; the loop keeps collecting.
                logger.error(f"HTTP error downloading {img_url}: {e}")
            except Exception as e:
                logger.error(f"Failed to download {img_url}: {e}")

        start += 10

    return downloaded_files
163
+
164
+
165
def text_search_link(
    query: str,
    output_dir: str = "g3/data/prompt_data",
    filename: str = "text_search.json",
    num_results: int = 10,
    api_key: Optional[str] = None,
    cx: Optional[str] = None,
) -> str:
    """
    Search for web links using Google Custom Search API and save results to JSON file.

    Args:
        query (str): Search query string
        output_dir (str): Directory to save the results file
        filename (str): Name of the JSON file to save results
        num_results (int): Number of search results to retrieve (max 100)
        api_key (Optional[str]): Google API key, defaults to environment variable
        cx (Optional[str]): Custom Search Engine ID, defaults to environment variable

    Returns:
        str: Path to the saved JSON file

    Raises:
        ValueError: If API key or CX not provided
        httpx.HTTPError: If API request fails
    """
    if not api_key:
        api_key = os.getenv("GOOGLE_CLOUD_API_KEY")
    if not cx:
        cx = os.getenv("GOOGLE_CSE_CX")

    if not api_key or not cx:
        raise ValueError("GOOGLE_CLOUD_API_KEY or GOOGLE_CSE_CX not set.")

    # BUG FIX: create the output directory up front. Previously the
    # empty-query early-return path wrote the JSON file *before*
    # os.makedirs ran, raising FileNotFoundError when the directory
    # did not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    def _save_results(found: list[str]) -> str:
        # Serialize the query plus collected links to the output JSON file.
        search_results = {"query": query, "links": found}
        output_path = os.path.join(output_dir, filename)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(search_results, f, indent=2, ensure_ascii=False)
        logger.info(f"✅ Saved {len(found)} search results to: {output_path}")
        return output_path

    links = []
    start = 1

    # An empty query cannot be searched; persist an empty result set.
    if not query:
        return _save_results(links)

    # Google Custom Search API allows max 10 results per request
    while len(links) < num_results:
        remaining = num_results - len(links)
        current_num = min(10, remaining)

        params = {
            "q": query,
            "cx": cx,
            "key": api_key,
            "num": current_num,
            "start": start,
        }

        try:
            response = retry_request(
                lambda: httpx.get(
                    "https://customsearch.googleapis.com/customsearch/v1",
                    params=params,
                    timeout=30.0,
                )
            )
            response.raise_for_status()
            data = response.json()

            items = data.get("items", [])
            if not items:
                logger.info(
                    f"No more results available. Retrieved {len(links)} results."
                )
                break

            links.extend([item.get("link", "") for item in items if "link" in item])

            if len(links) >= num_results:
                break

        except httpx.HTTPError as e:
            logger.error(f"HTTP error during search: {e}")
            break
        except Exception as e:
            logger.error(f"Error during search: {e}")
            break

        start += 10

    # Ensure we only take the first num_results links
    return _save_results(links[:num_results])
src/prompt/template.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt for the initial geo-localization pass: the model analyzes the image(s)
# plus gathered web links and must reply with a strict JSON payload
# (latitude/longitude, location name, per-clue evidence entries).
# {prompt_data} is substituted via str.format(); doubled braces ({{ }}) are
# literal braces that survive formatting.
DIVERSIFICATION_PROMPT = """
You are an expert in geo-localization. Analyze the image and determine the most precise possible location—ideally identifying the exact building, landmark, or facility, not just the city.
Examine all provided content links in detail, using both textual and visual clues to support your conclusion.
Use only the provided links for evidence. Any additional links must directly support specific visual observations (e.g., satellite imagery or publicly available street-level photos of the same location).
Return your final answer as geographic coordinates.

{prompt_data}

Respond with **only** the following JSON structure (no extra text, markdown, or comments):

{{
    "latitude": float,
    "longitude": float,
    "location": string,
    "evidence": [
        {{
            "analysis": string,
            "references": [string, …]
        }}
    ]
}}

**Guidelines:**
- One entry per clue (visual and textual).
- Each object in the "evidence" list should explain a single textual or visual clue and be as many as possible. All image in the prompt follow the format: "image_{{idx:03d}}.jpg", starting from image_000.jpg.
- In the "references" list, each element must be a URL or an image file name (e.g., "image_000.jpg"). They are marked with indices like [1], [2], etc in order of appearance in "references" list. "Analysis" must use these indices to cite the corresponding references.
- The "analysis" field must describe the clue and cite reference in its corresponding "references" using bracketed indices like [1], [2], etc. The corresponding URLs or images for those references must be included in the "references" list for that object.
    + For contextual evidence, must cite textual/news URLs.
    + For visual clues, cite `image_{{idx:03d}}.jpg` in `references` and any satellite/map URLs as needed.
- MUST use given links to support the analysis.
- If you can’t identify a specific building, give the city‑center coordinates.
"""
33
+
34
# Prompt used to geocode a free-form location name into coordinates.
# {location} is substituted via str.format(); doubled braces ({{ }}) are
# literal braces in the expected JSON answer template.
LOCATION_PROMPT = """
Location: {location}

Your task is to determine the geographic coordinates (latitude and longitude) of the specified location by following these steps:

1. Attempt to find the exact GPS coordinates using reliable online sources such as maps or satellite imagery.

2. If the exact location is not available, find the coordinates of a nearby or adjacent place (e.g., a recognizable landmark, building, road, or intersection).

3. If no specific nearby location can be found, use the coordinates of the broader area (e.g., the center of Khan Younis or Gaza).

4. In the "references" list, each element must be a URL or an image file name (e.g., "image_000.jpg"). They are marked with indices like [1], [2], etc in order of appearance in "references" list. "Analysis" must use these indices to cite the corresponding references.

Return your answer in the following JSON format:

{{
    "latitude": float,
    "longitude": float,
    "analysis": "Describe how the coordinates were identified or approximated, including any visual or textual clues used.",
    "references": ["URL1", "URL2", ...]
}}

- The "analysis" must clearly explain the reasoning behind the chosen coordinates.
- The "references" list must include all URLs cited in the analysis.
- Do not include any text outside of the JSON structure.
"""
60
+
61
# Prompt for the verification pass over a previous prediction.
# {prompt_data}, {prediction} and {satellite_image_id} are real str.format()
# placeholders; doubled braces ({{ }}) — including {{idx:03d}} — are literal
# text that survives formatting.
VERIFICATION_PROMPT = """
You are an expert in multimedia verification. Analyze the provided content and decide if it’s authentic or fabricated. Support your conclusion with detailed, verifiable evidence.

{prompt_data}

Prediction to verify:
{prediction}

Guidelines:
1. Output only a JSON object with these fields:
{{
    "latitude": float,
    "longitude": float,
    "location": string,
    "evidence": [
        {{
            "analysis": string,
            "references": [string, …]
        }}
    ]
}}

2. Images are named “image_{{idx:03d}}.jpg”:
   - Images up to “image_{satellite_image_id}.jpg” were used to generate the prediction.
   - “image_{satellite_image_id}.jpg” is the satellite reference.
   - Images after that show the claimed location’s landmarks—use them only to confirm buildings or landmarks.

3. In the "references" field of response, each element must be a URL or an image file name (e.g., "image_000.jpg"). They are marked with indices like [1], [2], etc in order of appearance in "references" list. "Analysis" must use these indices to cite the corresponding references.

4. There must be both visual and contextual evidences. For each evidence entry:
   a. **Visual evidence**: cross‑check the original images against the satellite view.
      - When citing original images (those before `image_{satellite_image_id}.jpg`), **do not** list them alone: each must be accompanied by at least one supporting satellite image, street‑view photo, or map URL in the same reference list.
      - If confirmed, **rewrite and enrich** your analysis with additional visual details (textures, angles, shadows) and cite any new image or map references.
      - If it can’t be verified, **remove** that entry entirely.

   b. **Contextual evidence**: verify against the provided URLs.
      - If confirmed, **rewrite and expand** your analysis with deeper context (dates, sources, related events) and cite any new supporting links.
      - If it can’t be verified, **remove** that entry.

   c. Analyze but **do not** need cite transcript and metadata.

5. All evidence must directly support the predicted latitude/longitude. Do not include analysis or references unrelated to verifying that specific location.

6. Do **not** include any metadata (EXIF, timestamps, filenames) as evidence.

Return only the JSON—no extra text, markdown, or comments.
"""
src/setup.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from huggingface_hub import hf_hub_download
5
+
6
+ base_path = Path(__file__).parent
7
+
8
+
9
+ def setup(
10
+ local_path: Path,
11
+ repo_id: str,
12
+ filename: str,
13
+ subfolder: str | None = None,
14
+ repo_type: str | None = None,
15
+ ) -> None:
16
+ if not local_path.exists():
17
+ local_path.parent.mkdir(parents=True, exist_ok=True)
18
+
19
+ cached_path = hf_hub_download(
20
+ repo_id=repo_id,
21
+ subfolder=subfolder,
22
+ filename=filename,
23
+ repo_type=repo_type,
24
+ )
25
+ shutil.copy(cached_path, local_path)
26
+
27
+
28
+ if __name__ == "__main__":
29
+ checkpoint_path = (
30
+ base_path / "data/checkpoints/mercator_finetune_weight.pth"
31
+ ).resolve()
32
+ index_path = (base_path / "data/index/G3.index").resolve()
33
+ database_path = (base_path / "data/dataset/mp16/MP16_Pro_filtered.csv").resolve()
34
+
35
+ repo_id = "tduongvn/Checkpoints-ACMMM25"
36
+
37
+ setup(checkpoint_path, repo_id, "mercator_finetune_weight.pth")
38
+ setup(index_path, repo_id, "G3.index", "index")
39
+ setup(database_path, repo_id, "MP16_Pro_filtered.csv", "data/mp16")
src/utils.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import base64
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, Union
7
+
8
+ import numpy as np
9
+ import requests
10
+ import torch
11
+ import torch.nn as nn
12
+ from PIL import Image
13
+
14
# Set up logger
# Attach to uvicorn's error logger so messages surface in the server log.
logger = logging.getLogger("uvicorn.error")

# Generic type variable used by the retry helpers below.
T = TypeVar("T")

# Nominatim (OpenStreetMap) geocoding endpoint used by get_gps_from_location.
NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
# Nominatim's usage policy requires a descriptive User-Agent on every request.
DEFAULT_USER_AGENT = "keyframe_extraction_app"
21
+
22
+
23
def get_gps_from_location(
    location: str,
    language: str = "en",
    timeout: int = 10,
    user_agent: str = DEFAULT_USER_AGENT,
) -> Tuple[Optional[float], Optional[float]]:
    """
    Geocode a free-form location string via Nominatim (OpenStreetMap).

    Args:
        location (str): Location string (e.g., city, address)
        language (str): Language for results (default: 'en')
        timeout (int): Request timeout in seconds (default: 10)
        user_agent (str): User-Agent header (required by Nominatim)

    Returns:
        Tuple[Optional[float], Optional[float]]: (latitude, longitude),
        or (None, None) on failure
    """
    # Guard clause: reject non-string or blank input without touching the network.
    if not isinstance(location, str) or not location.strip():
        logger.warning("Invalid or empty location string provided.")
        return (None, None)

    query = {
        "q": location.strip(),
        "format": "json",
        "addressdetails": 1,
        "accept-language": language,
        "limit": 1,
    }
    request_headers = {"User-Agent": user_agent}

    try:
        resp = requests.get(
            NOMINATIM_URL, params=query, headers=request_headers, timeout=timeout
        )
        resp.raise_for_status()
        payload = resp.json()

        if not payload:
            logger.info(f"No results found for location: '{location}'")
            return (None, None)

        # Nominatim returns lat/lon as strings; convert to floats.
        return (float(payload[0]["lat"]), float(payload[0]["lon"]))

    except requests.RequestException as req_err:
        logger.error(f"Request error while geocoding '{location}': {req_err}")
    except (ValueError, KeyError, TypeError) as parse_err:
        logger.error(
            f"Failed to parse geocoding response for '{location}': {parse_err}"
        )

    return (None, None)
80
+
81
+
82
def calculate_similarity_scores(
    model: nn.Module,
    device: torch.device,
    predicted_coords: List[Tuple[float, float]],
    image_dir: Union[str, Path] = "images",
) -> np.ndarray:
    """
    Calculate similarity scores between images and predicted coordinates.

    Embeds every ``image_*.*`` file in ``image_dir`` with the model's vision
    tower and every (lat, lon) candidate with its location tower, then
    averages the per-image image↔coordinate similarities.

    Args:
        model: Model exposing ``vision_processor``, ``vision_model``,
            ``vision_projection``/``vision_projection_else_2``,
            ``location_encoder`` and ``location_projection_else``.
            NOTE(review): the exact projection pipeline and the meaning of
            ``vision_model(image)[1]`` (presumably the pooled output) are
            model-specific — confirm against the model class.
        device: Device tensors are moved to before inference.
        predicted_coords: List of (lat, lon) tuples to score.
        image_dir: Directory containing the candidate images.

    Returns:
        np.ndarray: Average similarity per coordinate, shape
        ``(len(predicted_coords),)``.

    Raises:
        ValueError: If ``image_dir`` does not exist.
    """
    all_similarities = []
    image_dir = Path(image_dir)

    if not image_dir.exists():
        raise ValueError(f"Image directory does not exist: {image_dir}")

    for image_file in image_dir.glob("image_*.*"):
        # Load image as PIL Image first
        pil_image = Image.open(image_file).convert("RGB")

        # Process the PIL image into the model's expected pixel tensor.
        # assumes the processor yields 224x224 inputs — TODO confirm.
        image = model.vision_processor(images=pil_image, return_tensors="pt")[
            "pixel_values"
        ].reshape(-1, 224, 224)
        image = image.unsqueeze(0).to(device)

        with torch.no_grad():
            # Index [1] selects the second output of the vision model
            # (presumably the pooled embedding — verify against the model).
            vision_output = model.vision_model(image)[1]

            image_embeds = model.vision_projection_else_2(
                model.vision_projection(vision_output)
            )
            # L2-normalize so the matmul below is a cosine similarity.
            image_embeds = image_embeds / image_embeds.norm(
                p=2, dim=-1, keepdim=True
            )  # b, 768

            # Process coordinates: encode all candidates in one batch.
            gps_batch = torch.tensor(predicted_coords, dtype=torch.float32).to(device)
            gps_input = gps_batch.clone().detach().unsqueeze(0)  # Add batch dimension
            b, c, _ = gps_input.shape
            gps_input = gps_input.reshape(b * c, 2)
            location_embeds = model.location_encoder(gps_input)
            location_embeds = model.location_projection_else(
                location_embeds.reshape(b * c, -1)
            )
            location_embeds = location_embeds / location_embeds.norm(
                p=2, dim=-1, keepdim=True
            )
            location_embeds = location_embeds.reshape(b, c, -1)  # b, c, 768

            # Cosine similarity of this image against every candidate coordinate.
            similarity = torch.matmul(
                image_embeds.unsqueeze(1), location_embeds.permute(0, 2, 1)
            )  # b, 1, c
            similarity = similarity.squeeze(1).cpu().detach().numpy()
            all_similarities.append(similarity[0])  # Remove batch dimension

    # Calculate average similarity across all images
    avg_similarities = np.mean(all_similarities, axis=0)
    return avg_similarities
147
+
148
+
149
def is_retryable_error(error: Exception) -> bool:
    """
    Determines if the given exception is retryable based on known patterns
    and exception types.

    Args:
        error (Exception): The exception to evaluate.

    Returns:
        bool: True if the error is considered retryable.
    """
    error_str = str(error).lower()

    # Known substrings that indicate retryable errors
    retryable_patterns = [
        "503",
        "500",
        "502",
        "504",
        "overloaded",
        "unavailable",
        "internal",
        "disconnected",
        "connection",
        "timeout",
        "remoteprotocolerror",
        "remote protocol error",
        "network",
        "socket",
        "ssl",
        "tls",
        "rate limit",
        "too many requests",
        "429",
        "service unavailable",
        "temporarily unavailable",
    ]

    if any(pattern in error_str for pattern in retryable_patterns):
        return True

    # Retryable exception type-name fragments.
    # BUG FIX: the previous exact-name match missed concrete subclasses —
    # e.g. the builtin TimeoutError ("timeouterror") never matched the
    # listed "timeout" entry. Matching by substring covers TimeoutError,
    # ConnectTimeout, ReadTimeout, RemoteProtocolError, etc., while still
    # accepting every type the old set accepted.
    retryable_type_fragments = (
        "connectionerror",
        "timeout",
        "httperror",
        "remoteclosederror",
        "remoteprotocolerror",
        "sslerror",
        "tlserror",
        "valueerror",
    )

    error_type = type(error).__name__.lower()
    return any(fragment in error_type for fragment in retryable_type_fragments)
205
+
206
+
207
async def handle_async_api_call_with_retry(
    api_call_func: Callable[[], Any],
    max_retries: int = 10,
    base_delay: float = 2.0,
    fallback_result: Optional[T] = None,
    error_context: str = "API call",
) -> T:
    """
    Run an asynchronous API call, retrying transient failures with
    exponential backoff.

    Args:
        api_call_func (Callable): An async function that returns any type (T).
        max_retries (int): Maximum retry attempts.
        base_delay (float): Initial delay for backoff (doubles each retry).
        fallback_result (Optional[T]): Optional result to return on failure.
        error_context (str): Contextual info for logging.

    Returns:
        T: Result from the API call or fallback.
    """
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            return await api_call_func()
        except Exception as error:
            logger.warning(
                f"{error_context} failed (attempt {attempt}/{max_retries}): {error}"
            )

            retryable = is_retryable_error(error)
            last_attempt = attempt == max_retries

            if retryable and not last_attempt:
                # Exponential backoff: base_delay * 2^(attempt - 1).
                delay = base_delay * (2 ** (attempt - 1))
                logger.info(f"Retrying in {delay:.1f}s...")
                await asyncio.sleep(delay)
                continue

            if not retryable:
                logger.error(f"Non-retryable error encountered: {error}")
            else:
                logger.error(f"Max retries reached for {error_context}. Giving up.")
            break

    # All attempts exhausted (or a non-retryable error): fall back if possible.
    if fallback_result is not None:
        logger.warning(f"Returning fallback result for {error_context}")
        return fallback_result

    logger.error(f"No fallback result provided for {error_context}.")
    raise RuntimeError(f"{error_context} failed with no result.")
260
+
261
def extract_and_parse_json(raw_text: str) -> Dict[str, Any]:
    """
    Extract and parse the first JSON object found in raw_text.
    Only returns a dict; falls back to {} on failure or if parsed value isn't a dict.

    Args:
        raw_text (str): Raw text (e.g., from an LLM response)

    Returns:
        Dict[str, Any]: Parsed JSON dict, or {} if none valid is found.
    """
    start = raw_text.find("{")
    end = raw_text.rfind("}")

    if start == -1 or end == -1 or end <= start:
        # BUG FIX: these logger calls previously passed a positional argument
        # with no %s placeholder in the message, which makes the logging
        # formatter fail instead of logging the detail.
        logger.error("⚠️ No JSON object found. Snippet: %s", raw_text[:200])
        return {}

    # Take the outermost {...} span and attempt to parse it.
    snippet = raw_text[start : end + 1]

    try:
        parsed = json.loads(snippet)
        if isinstance(parsed, dict):
            return parsed
        logger.error(
            "⚠️ JSON parsed but not a dict—got type: %s", type(parsed).__name__
        )
    except json.JSONDecodeError as e:
        logger.error("⚠️ JSON decoding error: %s", e)

    return {}
290
+
291
+
292
def image_to_base64(image_path: Path) -> str:
    """Return the contents of *image_path* encoded as a base64 string.

    Logs an error and returns an empty string when the file does not exist.
    """
    if not image_path.is_file():
        logger.error(f"No such image: {image_path}")
        return ""
    return base64.b64encode(image_path.read_bytes()).decode("utf-8")
298
+
299
+
300
def load_images_as_base64() -> Optional[list[str]]:
    """Load every prompt image as a base64 string, in deterministic order.

    Scans ``data/prompt_data/images`` next to this module.

    Returns:
        Base64 strings for each supported image, sorted by file name, or
        None when the directory is missing, empty, or holds no supported
        image files.
    """
    img_dir = Path(__file__).parent / "data" / "prompt_data" / "images"

    if not img_dir.exists() or not any(img_dir.iterdir()):
        return None

    base64_images: list[str] = []
    # BUG FIX: Path.iterdir() yields entries in arbitrary filesystem order,
    # but downstream prompts reference images by numeric index
    # (image_000.jpg, image_001.jpg, ...). Sort so the list order matches
    # the numbered file names.
    for file in sorted(img_dir.iterdir()):
        # NOTE(review): the image downloader also saves .webp/.heic files,
        # which this suffix filter skips — confirm whether that is intended.
        if file.is_file() and file.suffix.lower() in [".png", ".jpg", ".jpeg", ".gif"]:
            base64_images.append(base64.b64encode(file.read_bytes()).decode("utf-8"))
    return base64_images if base64_images else None