Upload folder using huggingface_hub
This view is limited to 50 files because the commit contains too many changes.
- .gitattributes +11 -0
- .gitignore +211 -0
- Custom_training.md +33 -0
- Dockerfile +33 -0
- GPT_evaluation/evaluate_benchmark.sh +51 -0
- GPT_evaluation/evaluate_benchmark_1_correctness.py +186 -0
- GPT_evaluation/evaluate_benchmark_2_detailed_orientation.py +186 -0
- GPT_evaluation/evaluate_benchmark_3_context.py +186 -0
- GPT_evaluation/evaluate_benchmark_4_temporal.py +185 -0
- GPT_evaluation/evaluate_benchmark_5_consistency.py +193 -0
- GPT_evaluation/evaluate_zeroshot.py +207 -0
- GPT_evaluation/evaluate_zeroshot.sh +25 -0
- LICENSE.md +14 -0
- LICENSE_Lavis.md +14 -0
- README.md +411 -0
- clean_stage3_json.py +35 -0
- convert_cmd_to_json.py +45 -0
- convert_csv_to_json2.py +34 -0
- environment.yml +317 -0
- evaluation/Goldfish_eval/movies/eval_model_summary_llama_vid.sh +66 -0
- evaluation/Goldfish_eval/movies/eval_model_summary_movie_chat.sh +44 -0
- evaluation/Goldfish_eval/movies/eval_model_summary_movie_qa.sh +63 -0
- evaluation/Goldfish_eval/movies/eval_q_related_info_llama_vid.sh +57 -0
- evaluation/Goldfish_eval/movies/eval_q_related_info_movie_chat.sh +42 -0
- evaluation/Goldfish_eval/movies/eval_q_related_info_movie_qa.sh +57 -0
- evaluation/Goldfish_eval/movies/submit_batch_jobs_llama_vid.py +14 -0
- evaluation/Goldfish_eval/movies/submit_batch_jobs_movie_qa.py +16 -0
- evaluation/Goldfish_eval/movies/submit_batch_jobs_moviechat.py +14 -0
- evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job.sh +51 -0
- evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_sub_v.sh +50 -0
- evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_sub_v_sub.sh +51 -0
- evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_vision_vision.sh +51 -0
- evaluation/Goldfish_eval/tvqa_eval/eval_model_summary.sh +59 -0
- evaluation/Goldfish_eval/tvqa_eval/eval_q_related_info.sh +71 -0
- evaluation/Goldfish_eval/tvqa_eval/submit_batch_jobs.py +25 -0
- evaluation/eval_goldfish_llama_vid.py +616 -0
- evaluation/eval_goldfish_movie_chat.py +453 -0
- evaluation/eval_goldfish_movie_qa.py +591 -0
- evaluation/eval_goldfish_tvqa_long.py +535 -0
- evaluation/eval_minigpt4_video.py +201 -0
- evaluation/eval_retrieval_acc_tvqa.py +316 -0
- filter_json.py +63 -0
- goldfish_demo.py +198 -0
- goldfish_inference.py +62 -0
- goldfish_lv.py +654 -0
- index.py +103 -0
- minigpt4/__init__.py +31 -0
- minigpt4/common/__init__.py +0 -0
- minigpt4/common/config.py +474 -0
- minigpt4/common/dist_utils.py +146 -0
.gitattributes
CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+repo_imgs/Goldfish_results_table.JPG filter=lfs diff=lfs merge=lfs -text
+repo_imgs/MiniGPT4-video_fig.jpg filter=lfs diff=lfs merge=lfs -text
+repo_imgs/demo_1.JPG filter=lfs diff=lfs merge=lfs -text
+repo_imgs/goldfishai.jpg filter=lfs diff=lfs merge=lfs -text
+repo_imgs/goldfishai_png.png filter=lfs diff=lfs merge=lfs -text
+repo_imgs/minigpt4_demo_icon.png filter=lfs diff=lfs merge=lfs -text
+repo_imgs/online_demo.jpeg filter=lfs diff=lfs merge=lfs -text
+repo_imgs/sample_1.gif filter=lfs diff=lfs merge=lfs -text
+repo_imgs/sample_2.gif filter=lfs diff=lfs merge=lfs -text
+repo_imgs/sample_3.gif filter=lfs diff=lfs merge=lfs -text
+repo_imgs/teaser_fig_final_final.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,211 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+
+wandb/
+jobs/logs/
+*.out
+*ipynb
+.history/
+*.json
+# *.sh
+.ipynb_common
+logs/
+results/
+prompts/
+output/
+ckpt/
+divide_vqa.py
+
+
+slurm*
+sbatch_generate*
+# ignore all videos and subtitles
+*.mp4
+*.mp3
+*.vtt
+*.mkv
+*.srt
+# ignore text files
+*.txt
+# ignore *.err and *.out
+*.err
+*.out
+*.pth
+*.pt
+*.json
+# ignore workspace folder
+workspace/*
+flagged/*
+jobs_video/eval/choose_best_ckpt/*
+datasets/*
+demo_job_new.sh
+gemini_eval
+llama3.py
+evaluation_subtitles.zip
+minigpt4/models/transformers
+new_workspace
+minigpt4_video
+minigpt4_video_eval
+Infinibench
+goldfish_inference_latency.py
+run.py
+evaluation/eval_infinibench.py
Custom_training.md
ADDED
@@ -0,0 +1,33 @@
+# Customizing MiniGPT4-video for your own video-text dataset
+
+## Add your own video dataloader
+Construct your own dataloader in `minigpt4/datasets/datasets/video_datasets.py`, based on the existing dataloaders.<br>
+Copy the Video_loader_template class and edit it according to the nature of your data.
+
+## Create a config file for your dataloader
+Create your YAML file at `minigpt4/configs/datasets/dataset_name/default.yaml`; it holds the paths to your dataset.<br>
+Copy the template file `minigpt4/configs/datasets/template/default.yaml` and edit the paths to point to your dataset.
+
+
+## Register your dataloader
+In the `minigpt4/datasets/builders/image_text_pair_builder.py` file,
+import your dataloader class from the `minigpt4/datasets/datasets/video_datasets.py` file.<br>
+Copy and edit the VideoTemplateBuilder class.<br>
+Set `train_dataset_cls = YourVideoLoaderClass`, the class you imported from `minigpt4/datasets/datasets/video_datasets.py`.
+
+## Edit the training config file
+Add your dataset to the `datasets` section of the YAML file as shown below:
+```yaml
+datasets:
+  dataset_name: # change this to your dataset name
+    batch_size: 4  # change this to your desired batch size
+    vis_processor:
+      train:
+        name: "blip2_image_train"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+    sample_ratio: 200  # if you are including joint training with other datasets, set the sample ratio here
+```
+
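The registration step in Custom_training.md above is terse, so here is a minimal sketch of what the copied builder could look like. It assumes the LAVIS-style registry that MiniGPT-4 builds on; `MyVideoDataset`, the `"my_videos"` key, and the config path are hypothetical placeholders, not names from this repository.

```python
# Hedged sketch of registering a custom video dataloader, per the steps in
# Custom_training.md above. Assumes the LAVIS-style registry used by
# MiniGPT-4; MyVideoDataset and "my_videos" are hypothetical placeholders.
from minigpt4.common.registry import registry
from minigpt4.datasets.builders.base_dataset_builder import BaseDatasetBuilder
from minigpt4.datasets.datasets.video_datasets import MyVideoDataset  # your copy of Video_loader_template


@registry.register_builder("my_videos")  # must match the key under `datasets:` in the training YAML
class MyVideosBuilder(BaseDatasetBuilder):
    # The loader class imported from video_datasets.py, as the doc instructs.
    train_dataset_cls = MyVideoDataset

    # Points at the default.yaml created in the previous step.
    DATASET_CONFIG_DICT = {"default": "configs/datasets/my_videos/default.yaml"}
```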
Dockerfile
ADDED
@@ -0,0 +1,33 @@
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+# FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu20.04
+# FROM nvcr.io/nvidia/pytorch:24.01-py3
+# Install necessary tools
+RUN apt-get update && apt-get install -y curl gnupg wget
+
+# Add the NVIDIA GPG key and repository
+RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
+    && apt-get update
+
+# Install the NVIDIA container toolkit
+RUN apt-get install -y nvidia-container-toolkit
+# Expose all GPUs and driver capabilities to the container
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# RUN apt install python3-pip -y
+COPY ./ /app
+WORKDIR /app
+
+RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y
+RUN apt-get install gcc -y
+
+RUN pip install -r requirements.txt
+
+ENV CUDA_VISIBLE_DEVICES=0
+ENV HF_TKN="put your huggingface token here"
+
+EXPOSE 7860
+CMD ["python", "minigpt4_video_demo.py"]
GPT_evaluation/evaluate_benchmark.sh
ADDED
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Define common arguments for all scripts
+
+PRED="pred_path"
+OUTPUT_DIR="output_dir"
+API_KEY="api_key"
+NUM_TASKS=128
+
+# Run the "correctness" evaluation script
+python evaluate_benchmark_1_correctness.py \
+  --pred_path "${PRED}" \
+  --output_dir "${OUTPUT_DIR}/correctness_eval" \
+  --output_json "${OUTPUT_DIR}/correctness_results.json" \
+  --api_key "${API_KEY}" \
+  --num_tasks "${NUM_TASKS}"
+
+# Run the "detailed orientation" evaluation script
+python evaluate_benchmark_2_detailed_orientation.py \
+  --pred_path "${PRED}" \
+  --output_dir "${OUTPUT_DIR}/detailed_eval" \
+  --output_json "${OUTPUT_DIR}/detailed_orientation_results.json" \
+  --api_key "${API_KEY}" \
+  --num_tasks "${NUM_TASKS}"
+
+# Run the "contextual understanding" evaluation script
+python evaluate_benchmark_3_context.py \
+  --pred_path "${PRED}" \
+  --output_dir "${OUTPUT_DIR}/context_eval" \
+  --output_json "${OUTPUT_DIR}/contextual_understanding_results.json" \
+  --api_key "${API_KEY}" \
+  --num_tasks "${NUM_TASKS}"
+
+# Run the "temporal understanding" evaluation script
+python evaluate_benchmark_4_temporal.py \
+  --pred_path "${PRED}" \
+  --output_dir "${OUTPUT_DIR}/temporal_eval" \
+  --output_json "${OUTPUT_DIR}/temporal_understanding_results.json" \
+  --api_key "${API_KEY}" \
+  --num_tasks "${NUM_TASKS}"
+
+# Run the "consistency" evaluation script
+python evaluate_benchmark_5_consistency.py \
+  --pred_path "${PRED}" \
+  --output_dir "${OUTPUT_DIR}/consistency_eval" \
+  --output_json "${OUTPUT_DIR}/consistency_results.json" \
+  --api_key "${API_KEY}" \
+  --num_tasks "${NUM_TASKS}"
+
+
+echo "All evaluations completed!"
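The five scripts invoked above all read a prediction file from `--pred_path`. A minimal sketch of the record format the four single-answer scripts below expect follows; the key names (`video_name`, `Q`, `A`, `pred`) come straight from their source, while the file name and values are illustrative. Note that the consistency scorer instead consumes paired fields (`q1`/`q2`, `pred1`/`pred2`), so in practice you would point it at a differently shaped file.

```python
# A sketch of the prediction file consumed via --pred_path by the
# single-answer benchmark scripts below: a JSON list with one record per
# question. "preds.json" and the sample values are illustrative.
import json

preds = [
    {
        "video_name": "video_001",      # grouped and suffixed per video by the scripts
        "Q": "What is the man doing?",  # question
        "A": "He is riding a bicycle.",  # ground-truth answer
        "pred": "A man rides a bike.",  # model prediction to be scored
    },
]

with open("preds.json", "w") as f:
    json.dump(preds, f)
```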
GPT_evaluation/evaluate_benchmark_1_correctness.py
ADDED
@@ -0,0 +1,186 @@
+import openai
+import os
+import argparse
+import json
+import ast
+from multiprocessing.pool import Pool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
+
+
+def annotate(prediction_set, caption_files, output_dir):
+    """
+    Evaluates question and answer pairs using GPT-3.
+    Returns a score for correctness.
+    """
+    for file in caption_files:
+        key = file[:-5]  # Strip file extension
+        qa_set = prediction_set[key]
+        question = qa_set['q']
+        answer = qa_set['a']
+        pred = qa_set['pred']
+        try:
+            # Compute the correctness score
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "system",
+                        "content":
+                            "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
+                            "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
+                            "------"
+                            "##INSTRUCTIONS: "
+                            "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
+                            "- The predicted answer must be factually accurate and align with the video content.\n"
+                            "- Consider synonyms or paraphrases as valid matches.\n"
+                            "- Evaluate the factual accuracy of the prediction compared to the answer."
+                    },
+                    {
+                        "role": "user",
+                        "content":
+                            "Please evaluate the following video-based question-answer pair:\n\n"
+                            f"Question: {question}\n"
+                            f"Correct Answer: {answer}\n"
+                            f"Predicted Answer: {pred}\n\n"
+                            "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
+                            "Please generate the response in the form of a Python dictionary string with the key 'score', where its value is the factual accuracy score in INTEGER, not STRING. "
+                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                            "For example, your response should look like this: {'score': 4}."
+                    }
+                ]
+            )
+            # Convert response to a Python dictionary.
+            response_message = completion["choices"][0]["message"]["content"]
+            response_dict = ast.literal_eval(response_message)
+            result_qa_pair = [response_dict, qa_set]
+
+            # Save the question-answer pairs to a json file.
+            with open(f"{output_dir}/{key}.json", "w") as f:
+                json.dump(result_qa_pair, f)
+
+        except Exception as e:
+            print(f"Error processing file '{key}': {e}")
+
+
+def main():
+    """
+    Main function to control the flow of the program.
+    """
+    # Parse arguments.
+    args = parse_args()
+
+    file = open(args.pred_path)
+    pred_contents = json.load(file)
+
+    # Dictionary to store the count of occurrences for each video_id
+    video_id_counts = {}
+    new_pred_contents = []
+
+    # Iterate through each sample in pred_contents
+    for sample in pred_contents:
+        video_id = sample['video_name']
+        if video_id in video_id_counts:
+            video_id_counts[video_id] += 1
+        else:
+            video_id_counts[video_id] = 0
+
+        # Create a new sample with the modified key
+        new_sample = sample
+        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
+        new_pred_contents.append(new_sample)
+
+    # Generating list of id's and corresponding files
+    id_list = [x['video_name'] for x in new_pred_contents]
+    caption_files = [f"{id}.json" for id in id_list]
+
+    output_dir = args.output_dir
+    # Generate output directory if not exists.
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Preparing dictionary of question-answer sets
+    prediction_set = {}
+    for sample in new_pred_contents:
+        id = sample['video_name']
+        question = sample['Q']
+        answer = sample['A']
+        pred = sample['pred']
+        qa_set = {"q": question, "a": answer, "pred": pred}
+        prediction_set[id] = qa_set
+
+    # Set the OpenAI API key.
+    openai.api_key = args.api_key
+    num_tasks = args.num_tasks
+
+    # While loop to ensure that all captions are processed.
+    while True:
+        try:
+            # Files that have already been processed.
+            completed_files = os.listdir(output_dir)
+            print(f"completed_files: {len(completed_files)}")
+
+            # Files that have not been processed yet.
+            incomplete_files = [f for f in caption_files if f not in completed_files]
+            print(f"incomplete_files: {len(incomplete_files)}")
+
+            # Break the loop when there are no incomplete files
+            if len(incomplete_files) == 0:
+                break
+            if len(incomplete_files) <= num_tasks:
+                num_tasks = 1
+
+            # Split tasks into parts.
+            part_len = len(incomplete_files) // num_tasks
+            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
+            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+
+            # Use a pool of workers to process the files in parallel.
+            with Pool() as pool:
+                pool.starmap(annotate, task_args)
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+    # Combine all the processed files into one
+    combined_contents = {}
+    json_path = args.output_json
+
+    # Iterate through json files
+    for file_name in os.listdir(output_dir):
+        if file_name.endswith(".json"):
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, "r") as json_file:
+                content = json.load(json_file)
+                combined_contents[file_name[:-5]] = content
+
+    # Write combined content to a json file
+    with open(json_path, "w") as json_file:
+        json.dump(combined_contents, json_file)
+    print("All evaluation completed!")
+
+    # Calculate average score
+    score_sum = 0
+    count = 0
+    for key, result in combined_contents.items():
+        count += 1
+        score_match = result[0]['score']
+        score = int(score_match)
+        score_sum += score
+    average_score = score_sum / count
+
+    print("Average score for correctness:", average_score)
+
+
+if __name__ == "__main__":
+    main()
+
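One fragility in `annotate()` above: the model reply is parsed with a bare `ast.literal_eval`, which raises whenever GPT wraps the dictionary in extra text. A hedged sketch of a more defensive parser follows; it is a suggested hardening, not code from this repository.

```python
# Defensive alternative to the bare ast.literal_eval() used in annotate()
# above: fall back to a regex when the model wraps the dict in extra text.
# This is a suggested hardening, not part of the original script.
import ast
import re


def parse_score(response_message: str) -> dict:
    try:
        return ast.literal_eval(response_message.strip())
    except (ValueError, SyntaxError):
        # Look for something like {'score': 4} anywhere in the reply.
        match = re.search(r"['\"]?score['\"]?\s*:\s*(\d+)", response_message)
        if match:
            return {"score": int(match.group(1))}
        raise ValueError(f"Could not extract a score from: {response_message!r}")
```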
GPT_evaluation/evaluate_benchmark_2_detailed_orientation.py
ADDED
@@ -0,0 +1,186 @@
+import openai
+import os
+import argparse
+import json
+import ast
+from multiprocessing.pool import Pool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
+
+
+def annotate(prediction_set, caption_files, output_dir):
+    """
+    Evaluates question and answer pairs using GPT-3 and
+    returns a score for detailed orientation.
+    """
+    for file in caption_files:
+        key = file[:-5]  # Strip file extension
+        qa_set = prediction_set[key]
+        question = qa_set['q']
+        answer = qa_set['a']
+        pred = qa_set['pred']
+        try:
+            # Compute the detailed-orientation score
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "system",
+                        "content":
+                            "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
+                            "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
+                            "------"
+                            "##INSTRUCTIONS: "
+                            "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
+                            "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
+                            "- Consider synonyms or paraphrases as valid matches.\n"
+                            "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
+                    },
+                    {
+                        "role": "user",
+                        "content":
+                            "Please evaluate the following video-based question-answer pair:\n\n"
+                            f"Question: {question}\n"
+                            f"Correct Answer: {answer}\n"
+                            f"Predicted Answer: {pred}\n\n"
+                            "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
+                            "Please generate the response in the form of a Python dictionary string with the key 'score', where its value is the detail orientation score in INTEGER, not STRING. "
+                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                            "For example, your response should look like this: {'score': 4}."
+                    }
+                ]
+            )
+            # Convert response to a Python dictionary.
+            response_message = completion["choices"][0]["message"]["content"]
+            response_dict = ast.literal_eval(response_message)
+            result_qa_pair = [response_dict, qa_set]
+
+            # Save the question-answer pairs to a json file.
+            with open(f"{output_dir}/{key}.json", "w") as f:
+                json.dump(result_qa_pair, f)
+
+        except Exception as e:
+            print(f"Error processing file '{key}': {e}")
+
+
+def main():
+    """
+    Main function to control the flow of the program.
+    """
+    # Parse arguments.
+    args = parse_args()
+
+    file = open(args.pred_path)
+    pred_contents = json.load(file)
+
+    # Dictionary to store the count of occurrences for each video_id
+    video_id_counts = {}
+    new_pred_contents = []
+
+    # Iterate through each sample in pred_contents
+    for sample in pred_contents:
+        video_id = sample['video_name']
+        if video_id in video_id_counts:
+            video_id_counts[video_id] += 1
+        else:
+            video_id_counts[video_id] = 0
+
+        # Create a new sample with the modified key
+        new_sample = sample
+        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
+        new_pred_contents.append(new_sample)
+
+    # Generating list of id's and corresponding files
+    id_list = [x['video_name'] for x in new_pred_contents]
+    caption_files = [f"{id}.json" for id in id_list]
+
+    output_dir = args.output_dir
+    # Generate output directory if not exists.
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Preparing dictionary of question-answer sets
+    prediction_set = {}
+    for sample in new_pred_contents:
+        id = sample['video_name']
+        question = sample['Q']
+        answer = sample['A']
+        pred = sample['pred']
+        qa_set = {"q": question, "a": answer, "pred": pred}
+        prediction_set[id] = qa_set
+
+    # Set the OpenAI API key.
+    openai.api_key = args.api_key
+    num_tasks = args.num_tasks
+
+    # While loop to ensure that all captions are processed.
+    while True:
+        try:
+            # Files that have already been processed.
+            completed_files = os.listdir(output_dir)
+            print(f"completed_files: {len(completed_files)}")
+
+            # Files that have not been processed yet.
+            incomplete_files = [f for f in caption_files if f not in completed_files]
+            print(f"incomplete_files: {len(incomplete_files)}")
+
+            # Break the loop when there are no incomplete files
+            if len(incomplete_files) == 0:
+                break
+            if len(incomplete_files) <= num_tasks:
+                num_tasks = 1
+
+            # Split tasks into parts.
+            part_len = len(incomplete_files) // num_tasks
+            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
+            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+
+            # Use a pool of workers to process the files in parallel.
+            with Pool() as pool:
+                pool.starmap(annotate, task_args)
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+    # Combine all the processed files into one
+    combined_contents = {}
+    json_path = args.output_json
+
+    # Iterate through json files
+    for file_name in os.listdir(output_dir):
+        if file_name.endswith(".json"):
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, "r") as json_file:
+                content = json.load(json_file)
+                combined_contents[file_name[:-5]] = content
+
+    # Write combined content to a json file
+    with open(json_path, "w") as json_file:
+        json.dump(combined_contents, json_file)
+    print("All evaluation completed!")
+
+    # Calculate average score
+    score_sum = 0
+    count = 0
+    for key, result in combined_contents.items():
+        count += 1
+        score_match = result[0]['score']
+        score = int(score_match)
+        score_sum += score
+    average_score = score_sum / count
+
+    print("Average score for detailed orientation:", average_score)
+
+
+if __name__ == "__main__":
+    main()
+
GPT_evaluation/evaluate_benchmark_3_context.py
ADDED
@@ -0,0 +1,186 @@
+import openai
+import os
+import argparse
+import json
+import ast
+from multiprocessing.pool import Pool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
+
+
+def annotate(prediction_set, caption_files, output_dir):
+    """
+    Evaluates question and answer pairs using GPT-3 and
+    returns a score for contextual understanding.
+    """
+    for file in caption_files:
+        key = file[:-5]  # Strip file extension
+        qa_set = prediction_set[key]
+        question = qa_set['q']
+        answer = qa_set['a']
+        pred = qa_set['pred']
+        try:
+            # Compute the contextual understanding score
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "system",
+                        "content":
+                            "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
+                            "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
+                            "------"
+                            "##INSTRUCTIONS: "
+                            "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
+                            "- The predicted answer must capture the main themes and sentiments of the video.\n"
+                            "- Consider synonyms or paraphrases as valid matches.\n"
+                            "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
+                    },
+                    {
+                        "role": "user",
+                        "content":
+                            "Please evaluate the following video-based question-answer pair:\n\n"
+                            f"Question: {question}\n"
+                            f"Correct Answer: {answer}\n"
+                            f"Predicted Answer: {pred}\n\n"
+                            "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
+                            "Please generate the response in the form of a Python dictionary string with the key 'score', where its value is the contextual understanding score in INTEGER, not STRING. "
+                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                            "For example, your response should look like this: {'score': 4}."
+                    }
+                ]
+            )
+            # Convert response to a Python dictionary.
+            response_message = completion["choices"][0]["message"]["content"]
+            response_dict = ast.literal_eval(response_message)
+            result_qa_pair = [response_dict, qa_set]
+
+            # Save the question-answer pairs to a json file.
+            with open(f"{output_dir}/{key}.json", "w") as f:
+                json.dump(result_qa_pair, f)
+
+        except Exception as e:
+            print(f"Error processing file '{key}': {e}")
+
+
+def main():
+    """
+    Main function to control the flow of the program.
+    """
+    # Parse arguments.
+    args = parse_args()
+
+    file = open(args.pred_path)
+    pred_contents = json.load(file)
+
+    # Dictionary to store the count of occurrences for each video_id
+    video_id_counts = {}
+    new_pred_contents = []
+
+    # Iterate through each sample in pred_contents
+    for sample in pred_contents:
+        video_id = sample['video_name']
+        if video_id in video_id_counts:
+            video_id_counts[video_id] += 1
+        else:
+            video_id_counts[video_id] = 0
+
+        # Create a new sample with the modified key
+        new_sample = sample
+        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
+        new_pred_contents.append(new_sample)
+
+    # Generating list of id's and corresponding files
+    id_list = [x['video_name'] for x in new_pred_contents]
+    caption_files = [f"{id}.json" for id in id_list]
+
+    output_dir = args.output_dir
+    # Generate output directory if not exists.
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Preparing dictionary of question-answer sets
+    prediction_set = {}
+    for sample in new_pred_contents:
+        id = sample['video_name']
+        question = sample['Q']
+        answer = sample['A']
+        pred = sample['pred']
+        qa_set = {"q": question, "a": answer, "pred": pred}
+        prediction_set[id] = qa_set
+
+    # Set the OpenAI API key.
+    openai.api_key = args.api_key
+    num_tasks = args.num_tasks
+
+    # While loop to ensure that all captions are processed.
+    while True:
+        try:
+            # Files that have already been processed.
+            completed_files = os.listdir(output_dir)
+            print(f"completed_files: {len(completed_files)}")
+
+            # Files that have not been processed yet.
+            incomplete_files = [f for f in caption_files if f not in completed_files]
+            print(f"incomplete_files: {len(incomplete_files)}")
+
+            # Break the loop when there are no incomplete files
+            if len(incomplete_files) == 0:
+                break
+            if len(incomplete_files) <= num_tasks:
+                num_tasks = 1
+
+            # Split tasks into parts.
+            part_len = len(incomplete_files) // num_tasks
+            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
+            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+
+            # Use a pool of workers to process the files in parallel.
+            with Pool() as pool:
+                pool.starmap(annotate, task_args)
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+    # Combine all the processed files into one
+    combined_contents = {}
+    json_path = args.output_json
+
+    # Iterate through json files
+    for file_name in os.listdir(output_dir):
+        if file_name.endswith(".json"):
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, "r") as json_file:
+                content = json.load(json_file)
+                combined_contents[file_name[:-5]] = content
+
+    # Write combined content to a json file
+    with open(json_path, "w") as json_file:
+        json.dump(combined_contents, json_file)
+    print("All evaluation completed!")
+
+    # Calculate average score
+    score_sum = 0
+    count = 0
+    for key, result in combined_contents.items():
+        count += 1
+        score_match = result[0]['score']
+        score = int(score_match)
+        score_sum += score
+    average_score = score_sum / count
+
+    print("Average score for contextual understanding:", average_score)
+
+
+if __name__ == "__main__":
+    main()
+
GPT_evaluation/evaluate_benchmark_4_temporal.py
ADDED
@@ -0,0 +1,185 @@
+import openai
+import os
+import argparse
+import json
+import ast
+from multiprocessing.pool import Pool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
+    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
+    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
+    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
+    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
+    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
+    args = parser.parse_args()
+    return args
+
+
+def annotate(prediction_set, caption_files, output_dir):
+    """
+    Evaluates question and answer pairs using GPT-3 and
+    returns a score for temporal understanding.
+    """
+    for file in caption_files:
+        key = file[:-5]  # Strip file extension
+        qa_set = prediction_set[key]
+        question = qa_set['q']
+        answer = qa_set['a']
+        pred = qa_set['pred']
+        try:
+            # Compute the temporal understanding score
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "system",
+                        "content":
+                            "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
+                            "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
+                            "------"
+                            "##INSTRUCTIONS: "
+                            "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
+                            "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
+                            "- Evaluate the temporal accuracy of the prediction compared to the answer."
+                    },
+                    {
+                        "role": "user",
+                        "content":
+                            "Please evaluate the following video-based question-answer pair:\n\n"
+                            f"Question: {question}\n"
+                            f"Correct Answer: {answer}\n"
+                            f"Predicted Answer: {pred}\n\n"
+                            "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
+                            "Please generate the response in the form of a Python dictionary string with the key 'score', where its value is the temporal accuracy score in INTEGER, not STRING. "
+                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
+                            "For example, your response should look like this: {'score': 4}."
+                    }
+                ]
+            )
+            # Convert response to a Python dictionary.
+            response_message = completion["choices"][0]["message"]["content"]
+            response_dict = ast.literal_eval(response_message)
+            result_qa_pair = [response_dict, qa_set]
+
+            # Save the question-answer pairs to a json file.
+            with open(f"{output_dir}/{key}.json", "w") as f:
+                json.dump(result_qa_pair, f)
+
+        except Exception as e:
+            print(f"Error processing file '{key}': {e}")
+
+
+def main():
+    """
+    Main function to control the flow of the program.
+    """
+    # Parse arguments.
+    args = parse_args()
+
+    file = open(args.pred_path)
+    pred_contents = json.load(file)
+
+    # Dictionary to store the count of occurrences for each video_id
+    video_id_counts = {}
+    new_pred_contents = []
+
+    # Iterate through each sample in pred_contents
+    for sample in pred_contents:
+        video_id = sample['video_name']
+        if video_id in video_id_counts:
+            video_id_counts[video_id] += 1
+        else:
+            video_id_counts[video_id] = 0
+
+        # Create a new sample with the modified key
+        new_sample = sample
+        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
+        new_pred_contents.append(new_sample)
+
+    # Generating list of id's and corresponding files
+    id_list = [x['video_name'] for x in new_pred_contents]
+    caption_files = [f"{id}.json" for id in id_list]
+
+    output_dir = args.output_dir
+    # Generate output directory if not exists.
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    # Preparing dictionary of question-answer sets
+    prediction_set = {}
+    for sample in new_pred_contents:
+        id = sample['video_name']
+        question = sample['Q']
+        answer = sample['A']
+        pred = sample['pred']
+        qa_set = {"q": question, "a": answer, "pred": pred}
+        prediction_set[id] = qa_set
+
+    # Set the OpenAI API key.
+    openai.api_key = args.api_key
+    num_tasks = args.num_tasks
+
+    # While loop to ensure that all captions are processed.
+    while True:
+        try:
+            # Files that have already been processed.
+            completed_files = os.listdir(output_dir)
+            print(f"completed_files: {len(completed_files)}")
+
+            # Files that have not been processed yet.
+            incomplete_files = [f for f in caption_files if f not in completed_files]
+            print(f"incomplete_files: {len(incomplete_files)}")
+
+            # Break the loop when there are no incomplete files
+            if len(incomplete_files) == 0:
+                break
+            if len(incomplete_files) <= num_tasks:
+                num_tasks = 1
+
+            # Split tasks into parts.
+            part_len = len(incomplete_files) // num_tasks
+            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
+            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]
+
+            # Use a pool of workers to process the files in parallel.
+            with Pool() as pool:
+                pool.starmap(annotate, task_args)
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+    # Combine all the processed files into one
+    combined_contents = {}
+    json_path = args.output_json
+
+    # Iterate through json files
+    for file_name in os.listdir(output_dir):
+        if file_name.endswith(".json"):
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, "r") as json_file:
+                content = json.load(json_file)
+                combined_contents[file_name[:-5]] = content
+
+    # Write combined content to a json file
+    with open(json_path, "w") as json_file:
+        json.dump(combined_contents, json_file)
+    print("All evaluation completed!")
+
+    # Calculate average score
+    score_sum = 0
+    count = 0
+    for key, result in combined_contents.items():
+        count += 1
+        score_match = result[0]['score']
+        score = int(score_match)
+        score_sum += score
+    average_score = score_sum / count
+
+    print("Average score for temporal understanding:", average_score)
+
+
+if __name__ == "__main__":
+    main()
+
GPT_evaluation/evaluate_benchmark_5_consistency.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import openai
import os
import argparse
import json
import ast
from multiprocessing.pool import Pool


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3 and
    returns a score for consistency.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question1 = qa_set['q1']
        question2 = qa_set['q2']
        answer = qa_set['a']
        pred1 = qa_set['pred1']
        pred2 = qa_set['pred2']
        try:
            # Compute the consistency score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
                            "You will be given two very similar questions, a common answer to both questions, and predicted answers for the two questions. "
                            "Your task is to compare the predicted answers for the two very similar questions, given a common correct answer, and determine if they are consistent. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
                            "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
                            "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
                            "- Evaluate the consistency of the two predicted answers compared to the correct answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question 1: {question1}\n"
                            f"Question 2: {question2}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer to Question 1: {pred1}\n"
                            f"Predicted Answer to Question 2: {pred2}\n\n"
                            "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                            "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {'score': 4}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = open(args.pred_path)
    pred_contents = json.load(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question1 = sample['Q1']
        question2 = sample['Q2']
        answer = sample['A']
        pred1 = sample['pred1']
        pred2 = sample['pred2']
        qa_set = {"q1": question1, "q2": question2, "a": answer, "pred1": pred1, "pred2": pred2}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_match = result[0]['score']
        score = int(score_match)
        score_sum += score
    average_score = score_sum / count

    print("Average score for consistency:", average_score)


if __name__ == "__main__":
    main()
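# Expected --pred_path format (a sketch inferred from the keys read in main above;
# every value shown is a placeholder):
# [
#   {"video_name": "video1", "Q1": "...", "Q2": "...", "A": "...",
#    "pred1": "...", "pred2": "..."},
#   ...
# ]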
GPT_evaluation/evaluate_zeroshot.py
ADDED
|
@@ -0,0 +1,207 @@
import openai
import os
import argparse
import json
import ast
from multiprocessing.pool import Pool


def parse_args():
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    parser.add_argument("--pred_path", required=True, help="The path to file containing prediction.")
    parser.add_argument("--output_dir", required=True, help="The path to save annotation json files.")
    parser.add_argument("--output_json", required=True, help="The path to save annotation final combined json file.")
    parser.add_argument("--api_key", required=True, help="OpenAI API key.")
    parser.add_argument("--num_tasks", required=True, type=int, help="Number of splits.")
    args = parser.parse_args()
    return args


def annotate(prediction_set, caption_files, output_dir):
    """
    Evaluates question and answer pairs using GPT-3 and
    returns a score for correctness.
    """
    for file in caption_files:
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['pred']
        try:
            # Compute the correctness score
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content":
                            "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
                            "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
                            "------"
                            "##INSTRUCTIONS: "
                            "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
                            "- Consider synonyms or paraphrases as valid matches.\n"
                            "- Evaluate the correctness of the prediction compared to the answer."
                    },
                    {
                        "role": "user",
                        "content":
                            "Please evaluate the following video-based question-answer pair:\n\n"
                            f"Question: {question}\n"
                            f"Correct Answer: {answer}\n"
                            f"Predicted Answer: {pred}\n\n"
                            "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
                            "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
                            "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                            "For example, your response should look like this: {'pred': 'yes', 'score': 4}."
                    }
                ]
            )
            # Convert response to a Python dictionary.
            response_message = completion["choices"][0]["message"]["content"]
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")


def main():
    """
    Main function to control the flow of the program.
    """
    # Parse arguments.
    args = parse_args()

    file = open(args.pred_path)
    pred_contents = json.load(file)

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Create a new sample with the modified key
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{id}.json" for id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        id = sample['video_name']
        question = sample['Q']
        answer = sample['A']
        pred = sample['pred']
        qa_set = {"q": question, "a": answer, "pred": pred}
        prediction_set[id] = qa_set

    # Set the OpenAI API key.
    openai.api_key = args.api_key
    num_tasks = args.num_tasks

    # While loop to ensure that all captions are processed.
    while True:
        try:
            # Files that have already been processed.
            completed_files = os.listdir(output_dir)
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score and accuracy
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in combined_contents.items():
        # Computing score
        count += 1
        try:
            score_match = result[0]['score']
            score = int(score_match)
            score_sum += score
        except Exception:
            print("Score not found for", key)
            continue

        # Computing accuracy
        try:
            pred = result[0]['pred']
            if "yes" in pred.lower():
                yes_count += 1
            elif "no" in pred.lower():
                no_count += 1
        except Exception:
            print("Prediction not found for", key)
            continue

    average_score = score_sum / count
    accuracy = yes_count / (yes_count + no_count)
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)


if __name__ == "__main__":
    main()
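# Shape of each saved per-video JSON (and of each entry in the combined
# --output_json) — a sketch of the [response_dict, qa_set] pair written by
# annotate above; the literal values are placeholders:
#   [{"pred": "yes", "score": 4},
#    {"q": "...", "a": "...", "pred": "..."}]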
GPT_evaluation/evaluate_zeroshot.sh
ADDED
|
@@ -0,0 +1,25 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=zeroshot_eval%j
#SBATCH --output=zeroshot_eval%j.out
#SBATCH --error=zeroshot_eval%j.err
#SBATCH --time=0-10:00:00
#SBATCH --mem=64G
#SBATCH --nodes=1

## run the application:

# PRED="pred_path"
# OUTPUT_DIR="output_dir"
# API_KEY="api_key"
# NUM_TASKS=128


python evaluate_zeroshot.py \
    --pred_path ${PRED} \
    --output_dir "${OUTPUT_DIR}/fewshot_accuracy" \
    --output_json "${OUTPUT_DIR}/fewshot_accuracy_results.json" \
    --api_key $API_KEY \
    --num_tasks $NUM_TASKS

echo pred_path: $PRED
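# Example submission (a sketch; the paths are placeholders, and the variables are
# exported so the job inherits them from the submission environment):
#   export PRED=results/msvd_preds.json
#   export OUTPUT_DIR=results/msvd
#   export API_KEY=sk-...
#   export NUM_TASKS=128
#   sbatch GPT_evaluation/evaluate_zeroshot.sh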
LICENSE.md
ADDED
|
@@ -0,0 +1,14 @@
BSD 3-Clause License

Copyright 2023 Deyao Zhu
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LICENSE_Lavis.md
ADDED
|
@@ -0,0 +1,14 @@
BSD 3-Clause License

Copyright (c) 2022 Salesforce, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md
ADDED
|
@@ -0,0 +1,411 @@
# [ECCV 2024 Accepted] Goldfish: Vision-Language Understanding of Arbitrarily Long Videos
# [CVPR2024W] MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens
**This repo contains the code for MiniGPT4-Video for short video understanding and Goldfish for long video understanding.**
<h3 style="text-align: center;">Online Demos</h3>
<div style="display: flex; justify-content: center; gap: 40px;">
  <div style="text-align: center;">
    <a href='https://goldfishdemo.loophole.site'>
      <img src='repo_imgs/goldfishai_png.png' width=200 height=200>
    </a>
    <div>
      <font size=3>
        <div>
          <img src="repo_imgs/goldfishai_png.png" width=18>
          <a href="https://vision-cair.github.io/Goldfish_website/">Project Page</a>
          <a href="https://arxiv.org/abs/2407.12679">📝 arXiv Paper</a>
          <a href="https://huggingface.co/datasets/Vision-CAIR/TVQA-Long/tree/main">🤗 TVQA-Long Dataset</a>
        </div>
      </font>
    </div>
  </div>
  <div style="text-align: center;">
    <a href='https://huggingface.co/spaces/Vision-CAIR/MiniGPT4-video'>
      <img src='repo_imgs/minigpt4_demo_icon.png' width=200 height=200>
    </a>
    <div>
      <font size=3>
        <div>
          <a href="https://vision-cair.github.io/MiniGPT4-video/">🎞️ Project Page</a>
          <a href="https://arxiv.org/abs/2404.03413">📝 arXiv Paper</a>
        </div>
      </font>
    </div>
  </div>
</div>


![demo_1](repo_imgs/demo_1.JPG)
## Overview
Most current LLM-based models for video understanding can process videos within minutes but struggle with lengthy videos due to the "noise and redundancy" and "memory and computation" challenges. In this paper, we present Goldfish, a methodology tailored for comprehending videos of arbitrary lengths. We also introduce the TVQA-long benchmark, specifically designed to evaluate models' capabilities in understanding long videos with questions in both vision and text content. Goldfish approaches these challenges with an efficient retrieval mechanism that initially gathers the top-k video clips relevant to the instruction before proceeding to provide the desired response. This design of the retrieval mechanism enables Goldfish to efficiently process arbitrarily long video sequences, facilitating its application in contexts such as movies or television series. To facilitate the retrieval process, we developed MiniGPT4-Video, which generates detailed descriptions for the video clips. In addressing the scarcity of benchmarks for long video evaluation, we adapted the TVQA short video benchmark for extended content analysis by aggregating questions from entire episodes, thereby shifting the evaluation from partial to full episode comprehension. We attained a 41.78% accuracy rate on the TVQA-long benchmark, surpassing previous methods by 14.94%. Our MiniGPT4-Video also shows exceptional performance in short video comprehension, exceeding existing state-of-the-art methods by 3.23%, 2.03%, 16.5% and 23.59% on the MSVD, MSRVTT, TGIF, and TVQA short video benchmarks, respectively. These results indicate that our models achieve significant improvements in both long and short video understanding.
### Goldfish framework (Long videos)
<br>
![methodology]()
### MiniGPT4-Video (Short videos)
![methodology](repo_imgs/MiniGPT4-video_fig.jpg)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/zeroshot-video-question-answer-on-tgif-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-tgif-qa?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/zero-shot-video-question-answer-on-tvqa)](https://paperswithcode.com/sota/zero-shot-video-question-answer-on-tvqa?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/video-based-generative-performance-1)](https://paperswithcode.com/sota/video-based-generative-performance-1?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/video-based-generative-performance-3)](https://paperswithcode.com/sota/video-based-generative-performance-3?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/video-based-generative-performance-4)](https://paperswithcode.com/sota/video-based-generative-performance-4?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/video-based-generative-performance-5)](https://paperswithcode.com/sota/video-based-generative-performance-5?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/video-based-generative-performance-2)](https://paperswithcode.com/sota/video-based-generative-performance-2?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/zeroshot-video-question-answer-on-msvd-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msvd-qa?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/zeroshot-video-question-answer-on-msrvtt-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msrvtt-qa?p=minigpt4-video-advancing-multimodal-llms-for)

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/minigpt4-video-advancing-multimodal-llms-for/zeroshot-video-question-answer-on-activitynet)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-activitynet?p=minigpt4-video-advancing-multimodal-llms-for)

## :rocket: Demo
**1. Clone the repository** <br>
```bash
git clone https://github.com/Vision-CAIR/MiniGPT4-video.git
cd MiniGPT4-video
```

**2. Set up the environment** <br>
```bash
conda env create -f environment.yml
```
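Then activate the environment (its name, `goldfish`, is defined at the top of `environment.yml`):
```bash
conda activate goldfish
```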
**3. Download the checkpoints**

| MiniGPT4-Video (Llama2 Chat 7B) | MiniGPT4-Video (Mistral 7B) |
:------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------:
| [Download](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_llama_checkpoint_last.pth) | [Download](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_mistral_checkpoint_last.pth) |

**4. Run the demo** <br>
Goldfish demo
```bash
# For recommended performance, add the parameter --use_openai_embedding True to the command below and set the API key in the environment variable OPENAI_API_KEY; otherwise the model will use the default embeddings.
export OPENAI_API_KEY="your_openai_key"
# Llama2
python goldfish_demo.py --ckpt path_to_video_checkpoint --cfg-path test_configs/llama2_test_config.yaml
# Mistral
python goldfish_demo.py --ckpt path_to_video_checkpoint --cfg-path test_configs/mistral_test_config.yaml
```
MiniGPT4-Video demo
```bash
# Llama2
python minigpt4_video_demo.py --ckpt path_to_video_checkpoint --cfg-path test_configs/llama2_test_config.yaml
# Mistral
python minigpt4_video_demo.py --ckpt path_to_video_checkpoint --cfg-path test_configs/mistral_test_config.yaml
```
### Inference
Follow the previous steps, replacing step 4 with this one <br>
Goldfish inference
```bash
# For recommended performance, add the parameter --use_openai_embedding True to the command below and set the API key in the environment variable OPENAI_API_KEY; otherwise the model will use the default embeddings.
export OPENAI_API_KEY="your_openai_key"
# Llama2
python goldfish_inference.py --ckpt path_to_llama2_checkpoint --cfg-path test_configs/llama2_test_config.yaml --video_path path_to_video --question "Your question here"
# Mistral
python goldfish_inference.py --ckpt path_to_mistral_checkpoint --cfg-path test_configs/mistral_test_config.yaml --video_path path_to_video --question "Your question here"
```
MiniGPT4-Video inference
```bash
# Llama2
python minigpt4_video_inference.py --ckpt path_to_llama2_checkpoint --cfg-path test_configs/llama2_test_config.yaml --video_path path_to_video --question "Your question here"
# Mistral
python minigpt4_video_inference.py --ckpt path_to_mistral_checkpoint --cfg-path test_configs/mistral_test_config.yaml --video_path path_to_video --question "Your question here"
```
## :fire: Training
For both Goldfish and MiniGPT4-Video, the only trainable component is the MiniGPT4-Video model. <br>
### To customize MiniGPT4-Video for your own video-text dataset
You can find the steps to customize MiniGPT4-Video for your own video-text dataset in [Custom_training.md](Custom_training.md)
### Training datasets
After downloading the datasets below, **go to the datasets configuration folder minigpt4/configs/datasets and set the paths for each dataset there.**<br>
Image-text training<br>
You can find the steps to download the datasets in [MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4/tree/main/dataset)<br>
+ LAION <br>
+ Conceptual Captions <br>
+ SBU <br>

Video-text training:<br>

+ [CMD](https://www.robots.ox.ac.uk/~vgg/data/condensed-movies/) <br>
+ [Webvid](https://github.com/m-bain/webvid/) <br>
+ [Video Instructional Dataset 100K](https://huggingface.co/datasets/MBZUAI/VideoInstruct-100K) <br>

You can find the annotation files for the video-text datasets here: [download](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/training_datasets) <br>


### Model training:
You can edit the number of GPUs in each script.sh below<br>
#### Stage 1 (image-text pretraining)

You can directly download the pretrained MiniGPT4 [checkpoint](https://drive.google.com/file/d/11nAPjEok8eAGGEG1N2vXo3kBLCg0WgUk/view?usp=sharing) aligned with Llama2. <br>

Or train it yourself:

```bash
# pretrain
# Llama2
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/224_minigpt4_llama2_image.yaml
# Mistral
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/224_minigpt4_mistral_image.yaml

# align
# To launch the second-stage alignment, first specify the path to the checkpoint file trained in the pretrain stage.
# Llama2
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/224_minigpt4_llama2_image_align.yaml
# Mistral
torchrun --nproc-per-node NUM_GPU train.py --cfg-path train_configs/224_minigpt4_mistral_image_align.yaml
```
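For example, to pretrain the Llama2 variant on a node with 4 GPUs (the GPU count here is only illustrative):
```bash
torchrun --nproc-per-node 4 train.py --cfg-path train_configs/224_minigpt4_llama2_image.yaml
```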
You can download our trained weights for this stage from here: [Llama2](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/image_llama2_checkpoint.pth) [Mistral](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/image_mistral_checkpoint.pth)<br>
#### Stage 2 (video captioning pretraining)

For **Llama2** <br>
set the cfg-path in the script to `train_configs/224_v2_llama2_video_stage_2.yaml` <br>
set the model name in `minigpt4/configs/datasets/cmd_video/default.yaml` and `minigpt4/configs/datasets/webvid/default.yaml` to llama2<br>
For **Mistral**<br>
set the cfg-path in the script to `train_configs/224_v2_mistral_video_stage_2.yaml` <br>
set the model name in `minigpt4/configs/datasets/cmd_video/default.yaml` and `minigpt4/configs/datasets/webvid/default.yaml` to mistral (see the config sketch after the command below)<br>

```bash
bash training_scripts/stage_2.sh
```
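A minimal sketch of that dataset-config edit; the field name `model_name` is an assumption here, so check the actual key used in the `default.yaml` files:
```yaml
# minigpt4/configs/datasets/cmd_video/default.yaml (hypothetical key name)
model_name: "llama2"  # or "mistral" for the Mistral run
```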
You can download our trained weights for this stage from here: [Llama2](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_captioning_llama_checkpoint_last.pth) [Mistral](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_captioning_mistral_checkpoint_last.pth)<br>

#### Stage 3 (video instruction finetuning)

For **Llama2** <br>
set the cfg-path in the script to `train_configs/224_v2_llama2_video_stage_3.yaml` <br>
set the model name in `minigpt4/configs/datasets/video_chatgpt/default.yaml` to llama2<br>

For **Mistral**<br>
set the cfg-path in the script to `train_configs/224_v2_mistral_video_stage_3.yaml` <br>
set the model name in `minigpt4/configs/datasets/video_chatgpt/default.yaml` to mistral<br>

```bash
bash training_scripts/stage_3.sh
```
You can download our trained weights for this stage from here: [Llama2](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_llama_checkpoint_last.pth) [Mistral](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_mistral_checkpoint_last.pth)<br>

## :zap: MiniGPT4-Video Evaluation
To reproduce the results, use the best checkpoints for each model: <br>
[Llama2](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_llama_checkpoint_best.pth) [Mistral](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/checkpoints/video_mistral_checkpoint_best.pth)<br>
We used the same evaluation protocol as [Video-ChatGPT](https://mbzuai-oryx.github.io/Video-ChatGPT/)<br>

|Method| Using Subtitles | Information Correctness | Detailed Orientation | Contextual Understanding | Temporal Understanding | Consistency |
|:--------------------:|:----:|:------------------------:|:---------------------:|:-------------------------:|:-----------------------:|:------------:|
| LLaMA Adapter | :x:| 2.03 | 2.32| 2.30| 1.98| 2.15 |
| Video LLaMA| :x:| 1.96 | 2.18| 2.16| 1.82| 1.79 |
| Video Chat| :x:| 2.23 | 2.50| 2.53| 1.94| 2.24 |
| Video-ChatGPT | :x:| 2.40 | 2.52| 2.62| 1.98| 2.37 |
| BT-Adapter-7B | :x:| 2.68 | 2.69| 3.27| 2.34| 2.46 |
| LLaMA-VID-7B| :x:| 2.96 | 3.00| 3.53| 2.46| 2.51 |
| **Ours-7B Llama2**| :x:| 2.93 | 2.97| 3.45| **2.47**| **2.60**|
| **Ours-7B Llama2**| :white_check_mark:| **3.08** | **3.02**| **3.57**| **2.65**| **2.67**|
| **Ours-7B Mistral** | :x:| 2.83| 2.52 | 3.01 | 2.32 | 2.40 |
| **Ours-7B Mistral**| :white_check_mark:| 2.91 | 2.57| 3.11| 2.33 | 2.39|


|Method| Using Subtitles | MSVD Acc.↑ | MSVD Score↑ | MSRVTT Acc.↑ | MSRVTT Score↑ | TGIF Acc.↑ | TGIF Score↑ | ActivityNet Acc.↑ | ActivityNet Score↑ | TVQA Acc.↑ |
|:---------------------------------------:|:----------------:|:-----------:|:------------:|:--------------:|:---------------:|:-----------:|:------------:|:-------------------:|:--------------------:|:------------:|
| FrozenBiLM|:x:|32.2| --|16.8 |--| 41 |-- |24.7|--|29.7 |
| LLaMA Adapter|:x:|54.9| 3.1 |43.8 |2.7| -- |-- |34.2| 2.7| --|
| Video LLaMA|:x:|51.6| 2.5 |29|1.8| -- |-- |12.4| 1.1| --|
| Video Chat|:x:|56.3| 2.8 |45|2.5|34.4| 2.3 |26.5| 2.2|--|
| Video-ChatGPT|:x:|64.9| 3.3 |49.3 |2.8|51.4| 3.0 |35.2| 2.7|23.35|
| BT-Adapter-7B|:x:|67.7| 3.7 |57|3.2| -- |-- |45.7| 3.2| --|
| LLaMA-VID-7B |:x:|69.7| 3.7 |57.7 |3.2| -- |-- |**47.4**| **3.3**| --|
| **Ours-7B LLama2**|:x:|72.93|3.84|58.83|3.29|67.9|3.71| 45.85 |3.23|36.45|
| **Ours-7B Llama2**|:white_check_mark:|72.93|3.84|**59.73**|**3.3** |67.9|3.71| 46.3|3.4 |46.94|
| **Ours-7B Mistral**|:x:|**73.92**|**4.06**|58.26|3.52|**72.22**|**4.08**|44.25 |3.35|33.90|
| **Ours-7B Mistral**|:white_check_mark:|**73.92**|**4.06**|58.68|3.53 |**72.22**|**4.08**| 44.38|3.36 |**54.21** |

### Download datasets for evaluation
+ [MSVD](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/) <br>
+ [MSRVTT](https://cove.thecvf.com/datasets/839) <br>
+ [TGIF](https://github.com/YunseokJANG/tgif-qa/blob/master/dataset/README.md) <br>
+ [ActivityNet](https://mbzuaiac-my.sharepoint.com/:u:/g/personal/hanoona_bangalath_mbzuai_ac_ae/ESa302OCJMNHsMk7wuBbQc8BZH5CqlcdCWiSpXynQZDfAQ?e=CrOPbm) <br>
+ [TVQA](https://nlp.cs.unc.edu/data/jielei/tvqa/tvqa_public_html/download_tvqa.html) <br>
+ [Video-ChatGPT benchmark](https://mbzuai-oryx.github.io/Video-ChatGPT/) <br>

You can find the evaluation datasets' annotation files here: [download](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/evaluation_datasets) <br>

Subtitles for MSRVTT and ActivityNet are available here: [download](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/resolve/main/datasets/evaluation_subtitles.zip)
Note: these subtitles were generated using the <a href="https://github.com/openai/whisper">Whisper</a> model.<br>
TVQA subtitles can be downloaded from [here](https://nlp.cs.unc.edu/data/jielei/tvqa/tvqa_public_html/download_tvqa.html)
### Run evaluation script
Set each evaluation script's parameters inside the script <br>
```
NAME="" # Name of the experiment
BATCH_SIZE=8 # batch size
CKPT_PATH="" # path to the checkpoint
DATASET="msvd" # dataset name, available datasets: tvqa, msrvtt, msvd, activitynet, tgif, video_chatgpt_generic, video_chatgpt_temporal, video_chatgpt_consistency
# set the paths to the dataset files
videos_path="" # path to the videos file
subtitles_path="" # path to the subtitles file if the dataset is msrvtt, activitynet or tvqa, else set it to ""
ann_path="" # path to the annotations file
cfg_path="" # path to the config file
```
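For instance, a filled-in configuration for MSVD with the Mistral best checkpoint might look like this (the dataset paths are placeholders; MSVD takes no subtitles, per the comment above):
```bash
NAME="mistral_msvd_eval"
BATCH_SIZE=8
CKPT_PATH="checkpoints/video_mistral_checkpoint_best.pth"
DATASET="msvd"
videos_path="datasets/msvd/videos"
subtitles_path=""
ann_path="datasets/msvd/annotations.json"
cfg_path="test_configs/mistral_test_config.yaml"
```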
<br>

```bash
bash evaluation/minigpt4_video_eval/minigpt4_video_evalualtion.sh
```
Then use GPT-3.5 Turbo to compare the predictions with the ground truth and generate the accuracy and scores <br>
Set these variables in both evaluate_benchmark.sh and evaluate_zeroshot.sh <br>
```bash
PRED="path_to_predictions"
OUTPUT_DIR="path_to_output_dir"
API_KEY="openAI_key"
NUM_TASKS=128
```
Then, to evaluate the [Video-ChatGPT benchmark](https://mbzuai-oryx.github.io/Video-ChatGPT/), run the following script <br>
```bash
bash GPT_evaluation/evaluate_benchmark.sh
```
To evaluate open-ended questions, run the following script <br>
```bash
bash GPT_evaluation/evaluate_zeroshot.sh
```

## :zap: Goldfish Evaluation
**Long video benchmarking results on four benchmarks: LLama-Vid, MovieChat, Movie QA, and our proposed TVQA-Long. The "V" modality indicates the use of video frames only, while "V+T" indicates the use of both video frames and subtitles.**

<!--  -->
| Method | Modalities | LLama-Vid Acc.↑ | LLama-Vid Score↑ | MovieChat Acc.↑ | MovieChat Score↑ | Movie QA Acc.↑ | Movie QA Score↑ | TVQA-Long Acc.↑ | TVQA-Long Score↑ |
|-------------|------------|-----------------|------------------|-----------------|------------------|----------------|-----------------|------------|-------------|
| LLAMA-VID | V | 20.68 | 2.41 | 53.2 | 3.81 | 24.42 | 2.19 | 24.63 | 2.16 |
| MovieChat | V | 11.71 | 1.45 | NA | NA | 16.18 | 1.68 | 5.0 | 0.86 |
| Ours | V | **23.09** | 2.19 | **67.6** | **4.23** | **28.49** | **2.8** | **28.61** | **2.78** |
| LLAMA-VID | V+T | 41.4† | 3.07† | NA | NA | 37.65† | 3.03† | 26.86 | 2.21 |
| Ours | V+T | 31.49 | 2.48 | NA | NA | 35.24 | **3.1** | **41.78** | **3.21** |

**Note: The dagger † symbol indicates that the method used the benchmark during training, which implies an unfair comparison.**

To reproduce the results, use `checkpoints/video_llama_checkpoint_last.pth` and the OpenAI embeddings `--use_openai_embedding=True`<br>
### Download datasets for evaluation
For **Llama-vid** and **MovieQA** <br>
Download the original MovieNet data with movies and annotations from [here](https://opendatalab.com/OpenDataLab/MovieNet/tree/main/raw)<br>
These will be the source videos for LLama-vid and MovieQA <br>
#### Filtered annotations, as described in the paper and used for evaluation
[Llama-vid](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/goldfish_eval_datasets/llama_vid)<br>
[MovieQA](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/goldfish_eval_datasets/movie_qa)<br>
For **MovieChat**, the only videos available while implementing this work were 10% of the training data; this is what we used for evaluation, and the list can be found [here](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/blob/main/datasets/goldfish_eval_datasets/movie_chat/available_movies_list.txt) <br>
The full dataset can be found [here](https://huggingface.co/datasets/Enxin/MovieChat-1K_train/tree/main) <br>
For **TVQA-Long** <br>
If you want to use TVQA-Long for another model (e.g., llama-vid), both videos and annotations can be found here: [TVQA-Long](https://huggingface.co/datasets/Vision-CAIR/TVQA-Long/tree/main).
For Goldfish evaluation we will use the separated clips from the original TVQA dataset <br>
### Run the evaluation scripts
```bash
# Llama-vid evaluation
# set these parameters in the script
videos_path="path to the videos"
subtitle_path="path to the subtitles"
video_clips_saving_path="path to save the video clips"
annotation_file="path to the annotation file"
movienet_annotations_dir="path to the movienet annotations directory"
NEIGHBOURS=3
use_openai_embedding="whether to use openai embeddings or not"
# then run the script
bash evaluation/Goldfish_eval/movies/eval_model_summary_llama_vid.sh

# MovieQA evaluation
# same as above, but set the parameters in the script to the MovieQA paths
bash evaluation/Goldfish_eval/movies/eval_model_summary_movie_qa.sh

# MovieChat evaluation
# set these parameters in the script
dataset_path="path to the movies folder"
annotation_json_folder="path to the jsons folder"
# then run the script
bash evaluation/Goldfish_eval/movies/eval_model_summary_movie_chat.sh
```
### TVQA-Long
For Goldfish evaluation we can use the original separated clips from the original TVQA dataset <br>
Download the original TVQA videos and clip subtitles for short videos from [here](https://nlp.cs.unc.edu/data/jielei/tvqa/tvqa_public_html/download_tvqa.html)<br>
tvqa_long_annotation [here](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/goldfish_eval_datasets/tvqa/tvqa_val_edited.json) <br>
tvqa_json_subtitles [here](https://huggingface.co/Vision-CAIR/MiniGPT4-Video/tree/main/datasets/goldfish_eval_datasets/tvqa/tvqa_preprocessed_subtitles.json)<br>

```bash
# set these parameters in the script
tvqa_json_subtitles="path to the tvqa json subtitles file"
tvqa_clips_subtitles="path to the tvqa clips subtitles"
videos_frames="path to the video frames"
tvqa_long_annotation="path to the TVQA-Long annotation file"
NEIGHBOURS=3
use_openai_embedding="whether to use openai embeddings or not"
# then run the script
bash evaluation/Goldfish_eval/tvqa_eval/eval_model_summary.sh
```

Then use GPT-3.5 Turbo to compare the predictions with the ground truth and generate the accuracy and scores <br>
Set these variables in evaluate_zeroshot.sh <br>
```bash
PRED="path_to_predictions"
OUTPUT_DIR="path_to_output_dir"
API_KEY="openAI_key"
NUM_TASKS=128
```
To evaluate open-ended questions, run the following script <br>
```bash
bash GPT_evaluation/evaluate_zeroshot.sh
```

## Citation
If you're using MiniGPT4-Video or Goldfish in your research or applications, please cite using this BibTeX:
```
@misc{ataallah2024goldfishvisionlanguageunderstandingarbitrarily,
      title={Goldfish: Vision-Language Understanding of Arbitrarily Long Videos},
      author={Kirolos Ataallah and Xiaoqian Shen and Eslam Abdelrahman and Essam Sleiman and Mingchen Zhuge and Jian Ding and Deyao Zhu and Jürgen Schmidhuber and Mohamed Elhoseiny},
      year={2024},
      eprint={2407.12679},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2407.12679},
}
@article{ataallah2024minigpt4,
      title={MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding with Interleaved Visual-Textual Tokens},
      author={Ataallah, Kirolos and Shen, Xiaoqian and Abdelrahman, Eslam and Sleiman, Essam and Zhu, Deyao and Ding, Jian and Elhoseiny, Mohamed},
      journal={arXiv preprint arXiv:2404.03413},
      year={2024}
}
```

## Acknowledgements
[MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4) <br>
[Video-ChatGPT](https://mbzuai-oryx.github.io/Video-ChatGPT)

## License
This repository is under the [BSD 3-Clause License](LICENSE.md).
Much of the code is based on [MiniGPT4](https://github.com/Vision-CAIR/MiniGPT-4).
clean_stage3_json.py
ADDED
|
@@ -0,0 +1,35 @@
import json
import os

VIDEO_DIR = "datasets/stage3/videos"
JSON_PATH = "datasets/stage3/video_instruct_data.json"
OUTPUT_JSON = "datasets/stage3/video_instruct_data_clean.json"

def main():
    print("🚀 Cleaning the Stage 3 JSON...")
    # 1. Scan local video IDs
    existing_ids = set()
    for f in os.listdir(VIDEO_DIR):
        if f.endswith(('.mp4', '.mkv', '.webm')):
            existing_ids.add(os.path.splitext(f)[0])
    print(f"✅ Local videos found: {len(existing_ids)}")

    # 2. Load the full JSON
    with open(JSON_PATH, 'r') as f:
        data = json.load(f)

    # 3. Filter: keep only entries whose video exists locally
    clean_data = []
    for item in data:
        # Handle the different key names that may hold the video ID
        vid = item.get("video_id") or item.get("video_name") or item.get("image_id")
        if vid in existing_ids:
            clean_data.append(item)

    # 4. Save
    with open(OUTPUT_JSON, 'w') as f:
        json.dump(clean_data, f)
    print(f"🎉 Cleaning done! Valid entries: {len(clean_data)}. Saved to {OUTPUT_JSON}")

if __name__ == "__main__":
    main()
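# Usage (a sketch): run after downloading the Stage 3 videos so that
# video_instruct_data.json is filtered down to the clips present in VIDEO_DIR:
#   python clean_stage3_json.py
# An entry is kept only if its video_id / video_name / image_id matches the
# basename of a local .mp4/.mkv/.webm file.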
convert_cmd_to_json.py
ADDED
|
@@ -0,0 +1,45 @@
import pandas as pd
import json
import os

# ================= Config =================
BASE_DIR = "datasets"
METADATA_DIR = os.path.join(BASE_DIR, "CondensedMovies_Metadata")
VIDEO_DIR = os.path.join(BASE_DIR, "CondensedMovies_Videos")
OUTPUT_JSON = os.path.join(BASE_DIR, "cmd_annotations.json")
# ==========================================

def main():
    print("🚀 Generating the standard CMD JSON...")

    # 1. Read the metadata CSVs and join them on the video id
    df_clips = pd.read_csv(os.path.join(METADATA_DIR, "clips.csv"))
    df_desc = pd.read_csv(os.path.join(METADATA_DIR, "descriptions.csv"))
    df_merged = pd.merge(df_clips, df_desc, on="videoid", how="inner")

    # 2. Scan the local videos (all in the root directory now, and all mp4)
    existing_ids = set()
    for f in os.listdir(VIDEO_DIR):
        if f.endswith(".mp4"):
            existing_ids.add(os.path.splitext(f)[0])

    print(f"✅ Found {len(existing_ids)} videos locally")

    # 3. Build the annotation list
    annotations = []
    for _, row in df_merged.iterrows():
        vid = row['videoid']
        if vid in existing_ids:
            # Only image_id and caption are needed, exactly what the original code expects
            annotations.append({
                "image_id": vid,
                "caption": row['description']
            })

    # 4. Save
    with open(OUTPUT_JSON, 'w') as f:
        json.dump(annotations, f)
    print(f"🎉 JSON generated: {len(annotations)} entries")

if __name__ == "__main__":
    main()
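# Resulting annotation format (a sketch; the id and caption values are placeholders):
# [
#   {"image_id": "abc123", "caption": "A man walks into the bar..."},
#   ...
# ]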
convert_csv_to_json2.py
ADDED
|
@@ -0,0 +1,34 @@
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# 读取 CSV
|
| 6 |
+
csv_path = 'datasets/stage3/video_instruct_data.csv'
|
| 7 |
+
df = pd.read_csv(csv_path)
|
| 8 |
+
|
| 9 |
+
json_data = []
|
| 10 |
+
|
| 11 |
+
# 遍历每一行
|
| 12 |
+
for index, row in df.iterrows():
|
| 13 |
+
# 获取视频ID
|
| 14 |
+
vid = str(row['video_id']).strip()
|
| 15 |
+
|
| 16 |
+
# 获取问题和答案
|
| 17 |
+
question = str(row['q']).strip()
|
| 18 |
+
answer = str(row['a']).strip()
|
| 19 |
+
|
| 20 |
+
# 【关键修改】这里改回代码喜欢的 "q" 和 "a"
|
| 21 |
+
entry = {
|
| 22 |
+
"video_id": vid,
|
| 23 |
+
"q": question, # 之前写的是 "instruction",现在改回 "q"
|
| 24 |
+
"a": answer, # 之前写的是 "answer",现在改回 "a"
|
| 25 |
+
"length": 100
|
| 26 |
+
}
|
| 27 |
+
json_data.append(entry)
|
| 28 |
+
|
| 29 |
+
# 覆盖保存为 JSON
|
| 30 |
+
output_path = 'datasets/stage3/video_instruct_data.json'
|
| 31 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
| 32 |
+
json.dump(json_data, f, indent=4)
|
| 33 |
+
|
| 34 |
+
print(f"转换完成!已重新生成符合代码要求的 JSON。")
|
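Each resulting entry thus uses the q/a keys the training code reads; a hypothetical row (not from the real CSV) would come out as:

# Example output entry (hypothetical values):
example_entry = {
    "video_id": "v_0001",
    "q": "What is the person doing?",
    "a": "They are cooking.",
    "length": 100
}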
environment.yml
ADDED
@@ -0,0 +1,317 @@
name: goldfish
channels:
  - conda-forge
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - archspec=0.2.2=pyhd8ed1ab_0
  - boltons=23.1.1=pyhd8ed1ab_0
  - brotli-python=1.1.0=py39h3d6467e_1
  - bzip2=1.0.8=hd590300_5
  - c-ares=1.25.0=hd590300_0
  - ca-certificates=2024.2.2=hbcca054_0
  - certifi=2024.2.2=pyhd8ed1ab_0
  - cffi=1.16.0=py39h7a31438_0
  - charset-normalizer=3.3.2=pyhd8ed1ab_0
  - colorama=0.4.6=pyhd8ed1ab_0
  - conda=23.11.0=py39hf3d152e_1
  - conda-libmamba-solver=23.12.0=pyhd8ed1ab_0
  - conda-package-handling=2.2.0=pyh38be061_0
  - conda-package-streaming=0.9.0=pyhd8ed1ab_0
  - cudatoolkit=11.8.0=h4ba93d1_12
  - cudatoolkit-dev=11.7.0=h1de0b5d_6
  - distro=1.9.0=pyhd8ed1ab_0
  - faiss=1.7.4=py39cuda112h460e57a_0_cuda
  - fmt=10.1.1=h00ab1b0_1
  - freetype=2.12.1=h267a509_2
  - gmp=6.1.2=hf484d3e_1000
  - gnutls=3.5.19=h2a4e5f8_1
  - icu=73.2=h59595ed_0
  - idna=3.6=pyhd8ed1ab_0
  - jsonpatch=1.33=pyhd8ed1ab_0
  - jsonpointer=2.4=py39hf3d152e_3
  - keyutils=1.6.1=h166bdaf_0
  - krb5=1.21.2=h659d440_0
  - ld_impl_linux-64=2.40=h41732ed_0
  - libarchive=3.7.2=h2aa1ff5_1
  - libblas=3.9.0=20_linux64_openblas
  - libcblas=3.9.0=20_linux64_openblas
  - libcurl=8.5.0=hca28451_0
  - libedit=3.1.20191231=he28a2e2_2
  - libev=4.33=hd590300_2
  - libfaiss=1.7.4=cuda112hb18a002_0_cuda
  - libfaiss-avx2=1.7.4=cuda112h1234567_0_cuda
  - libffi=3.4.2=h7f98852_5
  - libgcc-ng=13.2.0=h807b86a_3
  - libgfortran-ng=13.2.0=h69a702a_3
  - libgfortran5=13.2.0=ha4646dd_3
  - libgomp=13.2.0=h807b86a_3
  - libiconv=1.17=hd590300_2
  - liblapack=3.9.0=20_linux64_openblas
  - libmamba=1.5.6=had39da4_0
  - libmambapy=1.5.6=py39h10defb6_0
  - libnghttp2=1.58.0=h47da74e_1
  - libnsl=2.0.1=hd590300_0
  - libopenblas=0.3.25=pthreads_h413a1c8_0
  - libpng=1.6.39=h753d276_0
  - libsolv=0.7.27=hfc55251_0
  - libsqlite=3.44.2=h2797004_0
  - libssh2=1.11.0=h0841786_0
  - libstdcxx-ng=13.2.0=h7e041cc_3
  - libuuid=2.38.1=h0b41bf4_0
  - libxcrypt=4.4.36=hd590300_1
  - libxml2=2.12.3=h232c23b_0
  - libzlib=1.2.13=hd590300_5
  - lz4-c=1.9.4=hcb278e6_0
  - lzo=2.10=h516909a_1000
  - menuinst=2.0.1=py39hf3d152e_0
  - ncurses=6.4=h59595ed_2
  - nettle=3.3=0
  - numpy=1.26.3=py39h474f0d3_0
  - openh264=1.8.0=hdbcaa40_1000
  - openssl=3.2.1=hd590300_0
  - packaging=23.2=pyhd8ed1ab_0
  - pip=23.3.2=pyhd8ed1ab_0
  - platformdirs=4.1.0=pyhd8ed1ab_0
  - pluggy=1.3.0=pyhd8ed1ab_0
  - pybind11-abi=4=hd8ed1ab_3
  - pycosat=0.6.6=py39hd1e30aa_0
  - pycparser=2.21=pyhd8ed1ab_0
  - pysocks=1.7.1=pyha2e5f31_6
  - python=3.9.18=h0755675_1_cpython
  - python_abi=3.9=4_cp39
  - readline=8.2=h8228510_1
  - reproc=14.2.4.post0=hd590300_1
  - reproc-cpp=14.2.4.post0=h59595ed_1
  - requests=2.31.0=pyhd8ed1ab_0
  - ruamel.yaml=0.18.5=py39hd1e30aa_0
  - ruamel.yaml.clib=0.2.7=py39hd1e30aa_2
  - tk=8.6.13=noxft_h4845f30_101
  - tqdm=4.66.1=pyhd8ed1ab_0
  - urllib3=2.1.0=pyhd8ed1ab_0
  - wheel=0.42.0=pyhd8ed1ab_0
  - x264=1!152.20180717=h14c3975_1001
  - xz=5.2.6=h166bdaf_0
  - yaml-cpp=0.8.0=h59595ed_0
  - zlib=1.2.13=hd590300_5
  - zstandard=0.22.0=py39h6e5214e_0
  - zstd=1.5.5=hfc55251_0
  - pip:
    - accelerate==0.25.0
    - aiofiles==23.2.1
    - aiohttp==3.9.1
    - aiosignal==1.3.1
    - altair==5.2.0
    - annotated-types==0.6.0
    - antlr4-python3-runtime==4.9.3
    - anyio==4.2.0
    - appdirs==1.4.4
    - asgiref==3.7.2
    - async-timeout==4.0.3
    - attrs==23.2.0
    - backoff==2.2.1
    - bcrypt==4.1.2
    - beautifulsoup4==4.12.2
    - bitarray==2.9.2
    - bitsandbytes==0.42.0
    - bleach==6.1.0
    - blinker==1.7.0
    - braceexpand==0.1.7
    - build==1.0.3
    - cachetools==5.3.2
    - chardet==5.2.0
    - chroma-hnswlib==0.7.3
    - chromadb==0.4.22
    - click==8.1.7
    - cmake==3.25.0
    - colbert-ai==0.2.18
    - coloredlogs==15.0.1
    - contourpy==1.2.0
    - cycler==0.12.1
    - datasets==2.17.0
    - decorator==4.4.2
    - decord==0.6.0
    - deprecated==1.2.14
    - dill==0.3.8
    - docker-pycreds==0.4.0
    - docopt==0.6.2
    - einops==0.7.0
    - exceptiongroup==1.2.0
    - faiss-gpu==1.7.2
    - fastapi==0.108.0
    - ffmpeg==1.4
    - ffmpeg-python==0.2.0
    - ffmpy==0.3.1
    - filelock==3.13.1
    - flask==3.0.2
    - flatbuffers==23.5.26
    - fonttools==4.47.0
    - frozenlist==1.4.1
    - fsspec==2023.10.0
    - ftfy==6.1.3
    - future==0.18.3
    - gdown==4.7.1
    - git-python==1.0.3
    - gitdb==4.0.11
    - gitpython==3.1.40
    - google-auth==2.26.1
    - googleapis-common-protos==1.62.0
    - gradio
    - gradio-client
    - h11==0.14.0
    - h5py==3.10.0
    - httpcore==1.0.2
    - httptools==0.6.1
    - httpx==0.26.0
    - huggingface-hub
    - humanfriendly==10.0
    - imageio==2.33.1
    - imageio-ffmpeg==0.4.9
    - importlib-metadata==6.11.0
    - importlib-resources==6.1.1
    - inquirerpy==0.3.4
    - iopath==0.1.10
    - itsdangerous==2.1.2
    - jinja2==3.1.2
    - joblib==1.3.2
    - jsonschema==4.20.0
    - jsonschema-specifications==2023.12.1
    - kaggle==1.6.0
    - kiwisolver==1.4.5
    - kubernetes==29.0.0
    - lazy-loader==0.3
    - lit==15.0.7
    - llvmlite==0.41.1
    - markdown-it-py==3.0.0
    - matplotlib==3.8.2
    - mdurl==0.1.2
    - mmh3==4.1.0
    - monotonic==1.6
    - more-itertools==10.1.0
    - moviepy==1.0.3
    - mpmath==1.3.0
    - multidict==6.0.4
    - multiprocess==0.70.16
    - mutagen==1.47.0
    - networkx==3.2.1
    - ninja==1.11.1.1
    - nltk==3.8.1
    - numba==0.58.1
    - omegaconf==2.3.0
    - onnxruntime==1.16.3
    - openai
    - openai-whisper==20231117
    - opencv-python==4.7.0.72
    - opentelemetry-api==1.22.0
    - opentelemetry-exporter-otlp-proto-common==1.22.0
    - opentelemetry-exporter-otlp-proto-grpc==1.22.0
    - opentelemetry-instrumentation==0.43b0
    - opentelemetry-instrumentation-asgi==0.43b0
    - opentelemetry-instrumentation-fastapi==0.43b0
    - opentelemetry-proto==1.22.0
    - opentelemetry-sdk==1.22.0
    - opentelemetry-semantic-conventions==0.43b0
    - opentelemetry-util-http==0.43b0
    - orjson==3.9.10
    - overrides==7.4.0
    - pandas==2.0.0
    - pathtools==0.1.2
    - peft==0.2.0
    - pfzy==0.3.4
    - pillow==10.2.0
    - plotly==5.18.0
    - portalocker==2.8.2
    - posthog==3.3.0
    - proglog==0.1.10
    - progressbar2==4.3.2
    - prompt-toolkit==3.0.43
    - protobuf==4.25.1
    - psutil==5.9.7
    - pulsar-client==3.4.0
    - pyarrow==15.0.0
    - pyarrow-hotfix==0.6
    - pyasn1==0.5.1
    - pyasn1-modules==0.3.0
    - pycocoevalcap==1.2
    - pycocotools==2.0.6
    - pycryptodomex==3.19.1
    - pydantic==2.5.3
    - pydantic-core==2.14.6
    - pydub==0.25.1
    - pygments==2.17.2
    - pyparsing==3.1.1
    - pypika==0.48.9
    - pyproject-hooks==1.0.0
    - pysrt==1.1.2
    - python-dateutil==2.8.2
    - python-dotenv==1.0.0
    - python-multipart==0.0.6
    - python-slugify==8.0.1
    - python-utils==3.8.1
    - pytubefix==6.5.1
    - pytz==2023.3.post1
    - pyyaml==6.0.1
    - referencing==0.32.0
    - regex==2023.12.25
    - rich==13.7.0
    - rouge==1.0.1
    - rpds-py==0.16.2
    - rsa==4.9
    - safetensors==0.4.1
    - scikit-image==0.22.0
    - scikit-learn==1.3.2
    - scipy==1.11.4
    - seaborn==0.13.1
    - semantic-version==2.10.0
    - sentence-transformers==2.2.2
    - sentencepiece==0.1.97
    - sentry-sdk==1.39.1
    - setproctitle==1.3.3
    - setuptools==69.0.3
    - shellingham==1.5.4
    - six==1.16.0
    - smmap==5.0.1
    - sniffio==1.3.0
    - soundfile==0.12.1
    - soupsieve==2.5
    - starlette==0.32.0.post1
    - sympy==1.12
    - tenacity==8.2.3
    - text-unidecode==1.3
    - threadpoolctl==3.2.0
    - tifffile==2023.12.9
    - tiktoken==0.5.2
    - timm
    - tokenizers==0.15.2
    - tomli==2.0.1
    - tomlkit==0.12.0
    - toolz==0.12.0
    - torch==2.2.2
    - torchaudio==2.2.2
    - torchvision==0.17.2
    - transformers
    # - triton==2.0.0
    - typer==0.9.0
    - typing-extensions==4.9.0
    - tzdata==2023.4
    - ujson==5.9.0
    - uvicorn==0.25.0
    - uvloop==0.19.0
    - visual-genome==1.1.1
    - wandb==0.14.2
    - watchfiles==0.21.0
    - wcwidth==0.2.13
    - webdataset==0.2.48
    - webencodings==0.5.1
    - websocket-client==1.7.0
    - websockets
    - webvtt-py==0.4.6
    - wrapt==1.16.0
    - xxhash==3.4.1
    - yarl==1.9.4
    - youtube-dl==2021.12.17
    - yt-dlp
    - zipp
    # - vllm
    # - openai-whisper
    # - triton==2.0.0
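Assuming a standard conda (or mamba) installation, this environment can be built with conda env create -f environment.yml and activated with conda activate goldfish; the name field above determines the environment name.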
evaluation/Goldfish_eval/movies/eval_model_summary_llama_vid.sh
ADDED
@@ -0,0 +1,66 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=L_RAG_general_summary_3_subtitles_together_%j
#SBATCH --output=L_RAG_general_summary_3_subtitles_together_%j.out
#SBATCH --error=L_RAG_general_summary_3_subtitles_together_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:

CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=4

NEIGHBOURS=3
## Dataset paths
videos_path="path to the videos"
subtitle_path="path to the subtitles"
video_clips_saving_path="path to save the video clips"
annotation_path="path to the annotation file"
movienet_annotations_dir="path to the movienet annotations directory"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle"
echo $exp_name
python evaluation/eval_goldfish_llama_vid.py --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_llama_vid.py --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# subtitles only (eliminate the vision)
# exp_name="subtitles_only"
# echo $exp_name
# python evaluation/eval_goldfish_llama_vid.py --index_subtitles_together --subtitles_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding
evaluation/Goldfish_eval/movies/eval_model_summary_movie_chat.sh
ADDED
@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=MC_RAG_general_summary_all_%j
#SBATCH --output=MC_RAG_general_summary_all_%j.out
#SBATCH --error=MC_RAG_general_summary_all_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=4
# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

NEIGHBOURS=-1 # use the whole neighbourhood for the global mode

dataset_path="path to the movies folder"
annotation_json_folder="path to the jsons folder"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

exp_name="model_summary_and_subtitle"
fps=2

# use the general summary
python evaluation/eval_goldfish_movie_chat.py --fps=$fps --neighbours_global=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --dataset_videos_path $dataset_path --annotation_json_folder $annotation_json_folder --use_openai_embedding $use_openai_embedding
evaluation/Goldfish_eval/movies/eval_model_summary_movie_qa.sh
ADDED
@@ -0,0 +1,63 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=M_RAG_general_summary_1_subtitles_together_%j
#SBATCH --output=M_RAG_general_summary_1_subtitles_together_%j.out
#SBATCH --error=M_RAG_general_summary_1_subtitles_together_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=100G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=4

NEIGHBOURS=3
## Dataset paths
videos_path="path to the videos"
subtitle_path="path to the subtitles"
video_clips_saving_path="path to save the video clips"
annotation_path="path to the annotation file"
movienet_annotations_dir="path to the movienet annotations directory"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle"
echo $exp_name
python evaluation/eval_goldfish_movie_qa.py --add_unknown --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_movie_qa.py --add_unknown --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# subtitles only (eliminate the vision)
# exp_name="subtitles_only"
# echo $exp_name
# python evaluation/eval_goldfish_movie_qa.py --add_unknown --index_subtitles_together --subtitles_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding
evaluation/Goldfish_eval/movies/eval_q_related_info_llama_vid.sh
ADDED
@@ -0,0 +1,57 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=job_name%j
#SBATCH --output=job_name%j.out
#SBATCH --error=job_name%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
BATCH_SIZE=4
START=$1
END=$2

NEIGHBOURS=3

# Dataset paths
videos_path="path to the videos"
subtitle_path="path to the subtitles"
video_clips_saving_path="path to save the video clips"
annotation_path="path to the annotation file"
movienet_annotations_dir="path to the movienet annotations directory"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle"
echo $exp_name
python evaluation/eval_goldfish_llama_vid.py --use_clips_for_info --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_llama_vid.py --use_clips_for_info --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# subtitles only (eliminate the vision)
# it uses only the summaries, so there is no need to run it with clips
evaluation/Goldfish_eval/movies/eval_q_related_info_movie_chat.sh
ADDED
@@ -0,0 +1,42 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=job_name%j
#SBATCH --output=job_name%j.out
#SBATCH --error=job_name%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
BATCH_SIZE=4
START=$1
END=$2
# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

NEIGHBOURS=-1 # use the whole neighbourhood for the global mode
dataset_path="path to the movies folder"
annotation_json_folder="path to the jsons folder"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

exp_name="model_summary_and_subtitle"
fps=2

# use --v_sum_and_info to get both the question-related info and the general summary

python evaluation/eval_goldfish_movie_chat.py --fps=$fps --neighbours_global=$NEIGHBOURS --batch_size=$BATCH_SIZE --start=$START --end=$END --use_clips_for_info --ckpt $CKPT_PATH --exp_name=$exp_name --dataset_videos_path $dataset_path --annotation_json_folder $annotation_json_folder --use_openai_embedding $use_openai_embedding
evaluation/Goldfish_eval/movies/eval_q_related_info_movie_qa.sh
ADDED
@@ -0,0 +1,57 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=M_RAG_clips_for_info_3_subtitles_together_%j
#SBATCH --output=M_RAG_clips_for_info_3_subtitles_together_%j.out
#SBATCH --error=M_RAG_clips_for_info_3_subtitles_together_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
NAME="ckpt_92"
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
BATCH_SIZE=4
START=$1
END=$2

NEIGHBOURS=3
# Dataset paths
videos_path="path to the videos"
subtitle_path="path to the subtitles"
video_clips_saving_path="path to save the video clips"
annotation_path="path to the annotation file"
movienet_annotations_dir="path to the movienet annotations directory"
# if you want to use the OpenAI embedding, you need to set the OPENAI_API_KEY
use_openai_embedding=True
export OPENAI_API_KEY="your_openai_key"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle"
echo $exp_name
python evaluation/eval_goldfish_movie_qa.py --add_unknown --use_clips_for_info --use_choices_for_info --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_movie_qa.py --add_unknown --use_clips_for_info --use_choices_for_info --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --videos_path $videos_path --subtitle_path $subtitle_path --video_clips_saving_path $video_clips_saving_path --annotation_path $annotation_path --movienet_annotations_dir $movienet_annotations_dir --use_openai_embedding $use_openai_embedding
evaluation/Goldfish_eval/movies/submit_batch_jobs_llama_vid.py
ADDED
@@ -0,0 +1,14 @@
import os

# bash_script = 'eval_q_related_info_llama_vid.sh'
bash_script = 'eval_model_summary_llama_vid.sh'
start = 0
end = 45
step = 11
for i in range(start, end, step):
    # print(i, i+step, job_id)
    # job_id += 1
    cmd = f'sbatch {bash_script} {str(i)} {str(i+step)}'
    # print(cmd)
    os.system(cmd)
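A quick dry-run trace of the chunking above, with os.system replaced by print (a standalone sketch; same arithmetic as the script):

for i in range(0, 45, 11):
    print(f'sbatch eval_model_summary_llama_vid.sh {i} {i + 11}')
# sbatch eval_model_summary_llama_vid.sh 0 11
# sbatch eval_model_summary_llama_vid.sh 11 22
# sbatch eval_model_summary_llama_vid.sh 22 33
# sbatch eval_model_summary_llama_vid.sh 33 44
# sbatch eval_model_summary_llama_vid.sh 44 55

The last chunk's end (55) overshoots end=45; this is presumably harmless, since the evaluation scripts only use the end index to bound their slice of the dataset.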
evaluation/Goldfish_eval/movies/submit_batch_jobs_movie_qa.py
ADDED
@@ -0,0 +1,16 @@
import os

bash_script = 'eval_model_summary_movie_qa.sh'
# bash_script = 'eval_q_related_info_movie_qa.sh'
start = 0
end = 30
step = 4
for i in range(start, end, step):
    # print(i, i+step, job_id)
    # job_id += 1
    cmd = f'sbatch {bash_script} {str(i)} {str(i+step)}'
    # print(cmd)
    os.system(cmd)
evaluation/Goldfish_eval/movies/submit_batch_jobs_moviechat.py
ADDED
@@ -0,0 +1,14 @@
import os

bash_script = 'eval_q_related_info_movie_chat.sh'
# bash_script = 'eval_model_summary_movie_chat.sh'
start = 0
end = 101
step = 26
for i in range(start, end, step):
    # print(i, i+step, job_id)
    # job_id += 1
    cmd = f'sbatch {bash_script} {str(i)} {str(i+step)}'
    # print(cmd)
    os.system(cmd)
evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job.sh
ADDED
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --partition=batch

#SBATCH --job-name=Retrieval_acc_3_%j
#SBATCH --output=Retrieval_acc_3_%j.out
#SBATCH --error=Retrieval_acc_3_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=100G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
cd ../../../
NAME="ckpt_92"
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=8

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

NEIGHBOURS=1
exp_name="vision"

python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name


# exp_name="subtitles"
# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name
evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_sub_v.sh
ADDED
@@ -0,0 +1,50 @@
#!/bin/bash
#SBATCH --partition=batch

#SBATCH --job-name=Retrieval_acc_3_%j
#SBATCH --output=Retrieval_acc_3_%j.out
#SBATCH --error=Retrieval_acc_3_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=100G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
NAME="ckpt_92"
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=8

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

NEIGHBOURS=1
# exp_name="vision"

# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name


exp_name="subtitles"
# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name
evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_sub_v_sub.sh
ADDED
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --partition=batch

#SBATCH --job-name=Retrieval_acc_3_%j
#SBATCH --output=Retrieval_acc_3_%j.out
#SBATCH --error=Retrieval_acc_3_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=100G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:

NAME="ckpt_92"
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=8

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

NEIGHBOURS=1
# exp_name="vision"

# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name


exp_name="subtitles"
python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name
evaluation/Goldfish_eval/retrival_accuracy/eval_retrieval_acc_tvqa_job_vision_vision.sh
ADDED
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --partition=batch

#SBATCH --job-name=Retrieval_acc_3_%j
#SBATCH --output=Retrieval_acc_3_%j.out
#SBATCH --error=Retrieval_acc_3_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=100G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
cd ../../../
NAME="ckpt_92"
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2
BATCH_SIZE=8

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"
echo "Batch size: $BATCH_SIZE"

NEIGHBOURS=1
exp_name="vision"

# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name


# exp_name="subtitles"
# python evaluation/eval_retrieval_acc_tvqa.py --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --vision_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# python evaluation/eval_retrieval_acc_tvqa.py --subtitles_only --start=$START --end=$END --neighbours=$NEIGHBOURS --batch_size=$BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name
evaluation/Goldfish_eval/tvqa_eval/eval_model_summary.sh
ADDED
@@ -0,0 +1,59 @@
#!/bin/bash
#SBATCH --partition=batch
#SBATCH --job-name=job_name%j
#SBATCH --output=job_name%j.out
#SBATCH --error=job_name%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
cd ../../../
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
START=$1
END=$2

BATCH_SIZE=4
NEIGHBOURS=3

# tvqa_json_subtitles="path to the tvqa json subtitles file"
# tvqa_clips_subtitles="path to the tvqa clips subtitles"
# videos_frames="path to the video frames"
# annotation_path="path to the TVQA-Long annotation file"

tvqa_json_subtitles="datasets/evaluation_datasets/goldfish_eval_datasets/tvqa/tvqa_preprocessed_subtitles.json"
tvqa_clips_subtitles="/ibex/project/c2090/datasets/TVR_dataset/videos/tvqa_subtitles"
videos_frames="/ibex/project/c2090/datasets/TVR_dataset/videos/video_files/frames_hq/"
annotation_path="datasets/evaluation_datasets/goldfish_eval_datasets/tvqa/tvqa_val_edited.json"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle_videoLLM"
echo $exp_name
python evaluation/eval_goldfish_tvqa_long.py --add_unknown --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --tvqa_json_subtitles $tvqa_json_subtitles --tvqa_clips_subtitles $tvqa_clips_subtitles --videos_frames $videos_frames --annotation_path $annotation_path

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_tvqa_long.py --add_unknown --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name

# subtitles only (eliminate the vision)
# exp_name="subtitles_only"
# echo $exp_name
# python evaluation/eval_goldfish_tvqa_long.py --add_unknown --index_subtitles_together --subtitles_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --name $NAME --ckpt $CKPT_PATH --exp_name=$exp_name
evaluation/Goldfish_eval/tvqa_eval/eval_q_related_info.sh
ADDED
@@ -0,0 +1,71 @@
#!/bin/bash
#SBATCH --partition=batch

#SBATCH --job-name=RAG_clips_info_1_vision_%j
#SBATCH --output=RAG_clips_info_1_vision_%j.out
#SBATCH --error=RAG_clips_info_1_vision_%j.err
#SBATCH --time=0-23:00:00
#SBATCH --mem=64G
#SBATCH --gres=gpu:a100:1
#SBATCH --nodes=1

## run the application:
cd ../../../
START=$1
END=$2

BATCH_SIZE=4
NEIGHBOURS=3
CKPT_PATH="checkpoints/video_llama_checkpoint_last.pth"
# tvqa_json_subtitles="path to the tvqa json subtitles file"
# tvqa_clips_subtitles="path to the tvqa clips subtitles"
# videos_frames="path to the video frames"
# annotation_path="path to the TVQA-Long annotation file"

tvqa_json_subtitles="datasets/evaluation_datasets/goldfish_eval_datasets/tvqa/tvqa_preprocessed_subtitles.json"
tvqa_clips_subtitles="/ibex/project/c2090/datasets/TVR_dataset/videos/tvqa_subtitles"
videos_frames="/ibex/project/c2090/datasets/TVR_dataset/videos/video_files/frames_hq/"
annotation_path="datasets/evaluation_datasets/goldfish_eval_datasets/tvqa/tvqa_val_edited.json"

# if start and end are not provided, use the whole dataset
if [ -z "$START" ]
then
    START=0
fi
if [ -z "$END" ]
then
    END=100000
fi
echo "Start: $START"
echo "End: $END"

# Vision + subtitles
exp_name="Vision_subtitles_model_summary_subtitle"
echo $exp_name
python evaluation/eval_goldfish_tvqa_long.py --add_unknown --use_clips_for_info --use_choices_for_info --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
    --tvqa_json_subtitles $tvqa_json_subtitles --tvqa_clips_subtitles $tvqa_clips_subtitles --videos_frames $videos_frames --annotation_path $annotation_path

# exp_name="Vision_subtitles_info_only"
# echo $exp_name
# python evaluation/eval_goldfish_tvqa_long.py --add_unknown --info_only --use_clips_for_info --use_choices_for_info --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --tvqa_json_subtitles $tvqa_json_subtitles --tvqa_clips_subtitles $tvqa_clips_subtitles --videos_frames $videos_frames --annotation_path $annotation_path

# exp_name="info_sub_after_retrieval"
# echo $exp_name
# python evaluation/eval_goldfish_tvqa_long.py --add_unknown --subtitles_only_after_retrieval --use_clips_for_info --use_choices_for_info --index_subtitles_together --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --tvqa_json_subtitles $tvqa_json_subtitles --tvqa_clips_subtitles $tvqa_clips_subtitles --videos_frames $videos_frames --annotation_path $annotation_path

# vision only
# exp_name="vision_only"
# echo $exp_name
# python evaluation/eval_goldfish_tvqa_long.py --add_unknown --use_clips_for_info --use_choices_for_info --vision_only --model_summary_only --neighbours=$NEIGHBOURS --start=$START --end=$END --batch_size $BATCH_SIZE --ckpt $CKPT_PATH --exp_name=$exp_name \
#     --tvqa_json_subtitles $tvqa_json_subtitles --tvqa_clips_subtitles $tvqa_clips_subtitles --videos_frames $videos_frames --annotation_path $annotation_path
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
bash_script = 'RAG_summary.sh'
|
| 5 |
+
# bash_script = 'RAG.sh'
|
| 6 |
+
|
| 7 |
+
# general
|
| 8 |
+
start=0
|
| 9 |
+
end=850
|
| 10 |
+
step=60
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# bash_script="RAG_summary_R_ablations.sh"
|
| 14 |
+
# sample 50
|
| 15 |
+
# start=0
|
| 16 |
+
# end=52
|
| 17 |
+
# step=6
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# job_id=32434597
|
| 21 |
+
for i in range(start, end, step):
|
| 22 |
+
# print(i, i+step, job_id)
|
| 23 |
+
# job_id+=1
|
| 24 |
+
cmd=f'sbatch {bash_script} {str(i)} {str(i+step)}'
|
| 25 |
+
os.system(cmd)
|
evaluation/eval_goldfish_llama_vid.py
ADDED
|
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import os
project_dir = os.getcwd()
sys.path.append(project_dir)
import json
import argparse
import re
import random
import shutil
import yaml
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from tqdm import tqdm
from PIL import Image
from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
from index import MemoryIndex

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')
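
# A quick illustration of why `str2bool` is needed instead of `type=bool`:
# argparse would otherwise treat every non-empty string, including "False",
# as truthy. This sketch is illustrative only and is never called by the script.
def _str2bool_demo():
    parser = argparse.ArgumentParser()
    parser.add_argument("--naive", type=bool, default=False)       # broken: bool("False") is True
    parser.add_argument("--robust", type=str2bool, default=False)  # parsed by the helper above
    ns = parser.parse_args(["--naive", "False", "--robust", "False"])
    print(ns.naive)   # True
    print(ns.robust)  # False
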
def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')
    parser.add_argument("--index_subtitles", action='store_true')
    parser.add_argument("--index_subtitles_together", action='store_true')

    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--summary_with_subtitles_only", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')

    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)
    parser.add_argument("--exp_name", type=str, default="", help="name of eval folder")

    parser.add_argument("--vision_only", action='store_true')
    parser.add_argument("--model_summary_only", action='store_true')
    parser.add_argument("--subtitles_only", action='store_true')
    parser.add_argument("--info_only", action='store_true')

    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--annotation_path", type=str, help="path to the annotation file")
    parser.add_argument("--videos_path", type=str, help="path to the videos directory")
    parser.add_argument("--subtitle_path", type=str, help="path to the subtitles directory")
    parser.add_argument("--movienet_annotations_dir", type=str, help="path to the movienet annotations directory")
    parser.add_argument("--video_clips_saving_path", type=str, help="path to save the split small video clips")
    parser.add_argument("--save_path", type=str, help="path to save the results")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()
def time_to_seconds(subrip_time):
    return subrip_time.hours * 3600 + subrip_time.minutes * 60 + subrip_time.seconds + subrip_time.milliseconds / 1000

def clean_text(subtitles_text):
    # Remove unwanted characters, keeping only letters, digits, whitespace, and single quotes
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple spaces with a single space
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()
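
# For reference, what clean_text does to a typical subtitle line:
#   clean_text("Hello!!   it's -- me...")  ->  "Hello it's me"
# (punctuation is stripped and runs of whitespace are collapsed)
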
class LlamaVidQAEval(GoldFish_LV):

    def __init__(self, args):
        super().__init__(args)
        self.save_json_path = "new_workspace/clips_summary/movienet"
        if args.use_openai_embedding:
            self.save_pkls_path = "new_workspace/open_ai_embedding/movienet"
        else:
            self.save_pkls_path = "new_workspace/embedding/movienet"
        os.makedirs(self.save_json_path, exist_ok=True)
        annotation_path = args.annotation_path
        with open(annotation_path, 'r') as f:
            self.movies_dict = json.load(f)
        self.max_sub_len = 400
        self.max_num_images = 45

    def _get_movie_data(self, videoname):
        video_images_path = f"{args.videos_path}/{videoname}"
        movie_clips_path = f"{args.video_clips_saving_path}/{videoname}"
        subtitle_path = f"{args.subtitle_path}/{videoname}.srt"
        annotation_file = f"{args.movienet_annotations_dir}/{videoname}.json"
        # load the annotation file
        with open(annotation_file, 'r') as f:
            movie_annotation = json.load(f)
        return video_images_path, subtitle_path, movie_annotation, movie_clips_path

    def _store_subtitles_paragraphs(self, subtitle_path, important_data, number_of_paragraphs):
        movie_name = subtitle_path.split('/')[-1].split('.')[0]
        # if there is no story annotation, split the subtitles into paragraphs
        paragraphs = split_subtitles(subtitle_path, number_of_paragraphs)
        for i, paragraph in enumerate(paragraphs):
            paragraph = clean_text(paragraph)
            important_data.update({f"subtitle_{i}__{movie_name}_clip_{str(i).zfill(2)}": paragraph})
        return important_data

    def _get_shots_subtitles(self, movie_annotation):
        shots_subtitles = {}
        if movie_annotation['story'] is not None:
            for section in movie_annotation['story']:
                for shot in section['subtitle']:
                    shot_number = shot['shot']
                    shot_subtitle = ' '.join(shot['sentences'])
                    shots_subtitles[shot_number] = clean_text(shot_subtitle)
        return shots_subtitles

    def prepare_input_images(self, clip_path, shots_subtitles, use_subtitles):
        video_frames_path = clip_path
        video_images_list = sorted(os.listdir(video_frames_path))
        sampling_interval = round(len(video_images_list) / self.max_num_images)
        if sampling_interval == 0:
            sampling_interval = 1
        number_of_words = 0
        images = []
        img_placeholder = ""
        for i, frame_name in enumerate(video_images_list):
            if i % sampling_interval == 0:
                frame = Image.open(os.path.join(video_frames_path, frame_name)).convert("RGB")
                images.append(self.vis_processor(frame))
                img_placeholder += '<Img><ImageHere>'
                shot_num = int(frame_name.split('_')[1])  # shot ids are stored as ints in _get_shots_subtitles
                if shots_subtitles.get(shot_num) is not None:
                    sub = clean_text(shots_subtitles[shot_num])
                    number_of_words += len(sub.split(' '))
                    if number_of_words <= self.max_sub_len and use_subtitles:
                        img_placeholder += f'<Cap>{sub}'
            if len(images) >= self.max_num_images:
                break
        if len(images) == 0:
            print("Video not found", video_frames_path)
        # pad short clips with the last frame so every clip has exactly max_num_images images
        if 0 < len(images) < self.max_num_images:
            last_item = images[-1]
            while len(images) < self.max_num_images:
                images.append(last_item)
                img_placeholder += '<Img><ImageHere>'
        images = torch.stack(images)
        return images, img_placeholder

    def _get_movie_summaries(self, video_images_path, use_subtitles, shots_subtitles, movie_clips_path):
        video_images_list = sorted(os.listdir(video_images_path))
        max_caption_index = 0
        preds = {}
        movie_name = movie_clips_path.split('/')[-1]
        videos_summaries = []
        previous_caption = ""
        batch_size = args.batch_size
        batch_images = []
        batch_instructions = []
        clip_numbers = []
        clip_number = 0
        conversations = []
        # each clip covers 135 consecutive frames, sampled every 3 frames -> at most 45 images per clip
        for i in tqdm(range(0, len(video_images_list), 135), desc="Inference video clips", total=len(video_images_list) // 135):
            images = []
            # optionally prepend the summary of all previous clips to the new video clip
            # if batch_size == 1:
            #     previous_caption = "You are analysing one long video of multiple clips and this is the summary of all previous clips: " + videos_summaries[-1] + "\n\n" if len(videos_summaries) > 0 else ""
            if previous_caption != "":
                img_placeholder = previous_caption + " "
            else:
                img_placeholder = ""
            number_of_words = 0
            max_num_words = 400
            max_num_images = 45
            clip_number_str = str(clip_number).zfill(2)
            clip_path = os.path.join(movie_clips_path, f"{movie_name}_clip_{clip_number_str}")
            os.makedirs(clip_path, exist_ok=True)
            conversation = ""
            for j in range(i, i + 135, 3):
                if j >= len(video_images_list):
                    break
                image_path = os.path.join(video_images_path, video_images_list[j])
                # copy the frame to the clip folder; skip it if it was already copied
                if not os.path.exists(os.path.join(clip_path, video_images_list[j])):
                    shutil.copy(image_path, clip_path)
                img = Image.open(image_path)
                images.append(self.vis_processor(img))
                img_placeholder += '<Img><ImageHere>'
                shot_num = int(video_images_list[j].split('_')[1])
                if use_subtitles:
                    if shots_subtitles.get(shot_num) is not None:
                        sub = clean_text(shots_subtitles[shot_num])
                        number_of_words += len(sub.split(' '))
                        if number_of_words <= max_num_words and use_subtitles:
                            img_placeholder += f'<Cap>{sub}'
                            conversation += sub + " "
                if len(images) >= max_num_images:
                    break
            if len(images) == 0:
                print("Video not found", video_images_path)
                continue
            if 0 < len(images) < max_num_images:
                last_item = images[-1]
                while len(images) < max_num_images:
                    images.append(last_item)
                    img_placeholder += '<Img><ImageHere>'
            images = torch.stack(images)
            print(images.shape)
            clip_numbers.append(clip_number_str)
            clip_number += 1
            conversations.append(clean_text(conversation))
            instruction = img_placeholder + '\n' + self.summary_instruction
            batch_images.append(images)
            batch_instructions.append(instruction)
            if len(batch_images) < batch_size:
                continue
            # run inference for the batch (k avoids shadowing the outer loop variable i)
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                    if conversations[k] != "" and use_subtitles:
                        preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]

            batch_images = []
            batch_instructions = []
            clip_numbers = []
            conversations = []

        # run inference for the last (partial) batch
        if len(batch_images) > 0:
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                    if conversations[k] != "" and use_subtitles:
                        preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]

            batch_images = []
            batch_instructions = []
        return preds
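
    # The chunking above walks the frame folder in windows of 135 frames with a
    # stride of 3, i.e. at most 45 sampled images per clip (matching
    # max_num_images). The same arithmetic in isolation, on toy numbers:
    #   frames = list(range(300))
    #   clips = [frames[i:i + 135:3] for i in range(0, len(frames), 135)]
    #   [len(c) for c in clips]  ->  [45, 45, 10]   (the short tail clip is padded to 45)
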
+
def movie_inference(self,videoname,use_subtitles):
|
| 288 |
+
embedding_path=os.path.join(self.save_pkls_path,f"{videoname}.pkl")
|
| 289 |
+
if args.index_subtitles_together:
|
| 290 |
+
file_path=os.path.join(self.save_json_path,f"{videoname}.json")
|
| 291 |
+
embedding_path=os.path.join(self.save_pkls_path,f"{videoname}.pkl")
|
| 292 |
+
else:
|
| 293 |
+
file_path=os.path.join(self.save_json_path,f"no_subtiltles_{videoname}.json")
|
| 294 |
+
embedding_path=os.path.join(self.save_pkls_path,f"no_subtiltles_{videoname}.pkl")
|
| 295 |
+
|
| 296 |
+
if args.subtitles_only:
|
| 297 |
+
file_path=os.path.join(self.save_json_path,f"subtiltles_only_{videoname}.json")
|
| 298 |
+
embedding_path=os.path.join(self.save_pkls_path,f"subtiltles_only_{videoname}.pkl")
|
| 299 |
+
|
| 300 |
+
if os.path.exists(file_path):
|
| 301 |
+
print("Already processed")
|
| 302 |
+
return file_path,embedding_path
|
| 303 |
+
important_data = {}
|
| 304 |
+
video_images_path,subtitle_path,movie_annotation,movie_clips_path=self._get_movie_data(videoname)
|
| 305 |
+
shots_subtitles={}
|
| 306 |
+
if use_subtitles:
|
| 307 |
+
if movie_annotation['story'] is not None:
|
| 308 |
+
shots_subtitles=self._get_shots_subtitles(movie_annotation)
|
| 309 |
+
if args.subtitles_only:
|
| 310 |
+
number_of_paragraphs=20
|
| 311 |
+
important_data=self._store_subtitles_paragraphs(subtitle_path,important_data,number_of_paragraphs)
|
| 312 |
+
else:
|
| 313 |
+
preds=self._get_movie_summaries(video_images_path,use_subtitles,shots_subtitles,movie_clips_path)
|
| 314 |
+
if len(shots_subtitles)==0 and use_subtitles:
|
| 315 |
+
number_of_paragraphs=len(preds)
|
| 316 |
+
important_data=self._store_subtitles_paragraphs(subtitle_path,important_data,number_of_paragraphs)
|
| 317 |
+
important_data.update(preds)
|
| 318 |
+
with open(file_path, 'w') as file:
|
| 319 |
+
json.dump(important_data, file, indent=4)
|
| 320 |
+
return file_path,embedding_path
|
| 321 |
+
def answer_movie_questions_RAG(self,qa_list,information_RAG_path,embedding_path):
|
| 322 |
+
QA_external_memory=MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
|
| 323 |
+
if os.path.exists(embedding_path):
|
| 324 |
+
QA_external_memory.load_embeddings_from_pkl(embedding_path)
|
| 325 |
+
else:
|
| 326 |
+
QA_external_memory.load_documents_from_json(information_RAG_path,embedding_path)
|
| 327 |
+
summarization_external_memory=MemoryIndex(-1, use_openai=args.use_openai_embedding)
|
| 328 |
+
if os.path.exists(embedding_path):
|
| 329 |
+
summarization_external_memory.load_embeddings_from_pkl(embedding_path)
|
| 330 |
+
else:
|
| 331 |
+
summarization_external_memory.load_documents_from_json(information_RAG_path,embedding_path)
|
| 332 |
+
|
| 333 |
+
# get the most similar context from the external memory to this instruction
|
| 334 |
+
general_related_context_keys_list=[]
|
| 335 |
+
general_related_context_documents_list=[]
|
| 336 |
+
summary_related_context_documents_list=[]
|
| 337 |
+
summary_related_context_keys_list=[]
|
| 338 |
+
total_batch_pred=[]
|
| 339 |
+
related_text=[]
|
| 340 |
+
qa_genearl_prompts=[]
|
| 341 |
+
qa_summary_prompts=[]
|
| 342 |
+
qa_general=[]
|
| 343 |
+
qa_summary=[]
|
| 344 |
+
for qa in qa_list:
|
| 345 |
+
if qa['q_type']=='summary':
|
| 346 |
+
related_context_documents,related_context_keys = summarization_external_memory.search_by_similarity(qa['Q'])
|
| 347 |
+
summary_related_context_documents_list.append(related_context_documents)
|
| 348 |
+
summary_related_context_keys_list.append(related_context_keys)
|
| 349 |
+
prompt=self.prepare_prompt(qa)
|
| 350 |
+
qa_summary_prompts.append(prompt)
|
| 351 |
+
qa_summary.append(qa)
|
| 352 |
+
else:
|
| 353 |
+
related_context_documents,related_context_keys = QA_external_memory.search_by_similarity(qa['Q'])
|
| 354 |
+
general_related_context_keys_list.append(related_context_keys)
|
| 355 |
+
general_related_context_documents_list.append(related_context_documents)
|
| 356 |
+
prompt=self.prepare_prompt(qa)
|
| 357 |
+
qa_genearl_prompts.append(prompt)
|
| 358 |
+
qa_general.append(qa)
|
| 359 |
+
# if I have summary questions answer first, without the need to use clips for information
|
| 360 |
+
if len(qa_summary_prompts)>0:
|
| 361 |
+
# Here the retrieved clips are all movie clips
|
| 362 |
+
context_information_list=[]
|
| 363 |
+
for related_context_keys in summary_related_context_keys_list:
|
| 364 |
+
most_related_clips=self.get_most_related_clips(related_context_keys)
|
| 365 |
+
context_information=""
|
| 366 |
+
for clip_name in most_related_clips:
|
| 367 |
+
clip_conversation=""
|
| 368 |
+
general_sum=""
|
| 369 |
+
for key in related_context_keys:
|
| 370 |
+
if clip_name in key and 'caption' in key:
|
| 371 |
+
general_sum="Clip Summary: "+summarization_external_memory.documents[key]
|
| 372 |
+
if clip_name in key and 'subtitle' in key:
|
| 373 |
+
clip_conversation="Clip Subtitles: "+summarization_external_memory.documents[key]
|
| 374 |
+
|
| 375 |
+
if args.use_coherent_description:
|
| 376 |
+
context_information+=f"{general_sum}\n"
|
| 377 |
+
else:
|
| 378 |
+
if args.model_summary_only:
|
| 379 |
+
context_information+=f"{general_sum}\n"
|
| 380 |
+
elif args.subtitles_only:
|
| 381 |
+
context_information+=f"{clip_conversation}\n"
|
| 382 |
+
else:
|
| 383 |
+
context_information+=f"{general_sum},{clip_conversation}\n"
|
| 384 |
+
context_information_list.append(context_information)
|
| 385 |
+
if args.use_chatgpt :
|
| 386 |
+
batch_pred=self.inference_RAG_chatGPT(qa_summary_prompts,context_information_list)
|
| 387 |
+
else:
|
| 388 |
+
batch_pred=self.inference_RAG(qa_summary_prompts,context_information_list)
|
| 389 |
+
total_batch_pred.extend(batch_pred)
|
| 390 |
+
related_text.extend(context_information_list)
|
| 391 |
+
|
| 392 |
+
if args.use_clips_for_info:
|
| 393 |
+
batch_pred,general_related_context_keys_list=self.use_clips_for_info(qa_general,general_related_context_keys_list,QA_external_memory)
|
| 394 |
+
total_batch_pred.extend(batch_pred)
|
| 395 |
+
related_text.extend(general_related_context_keys_list)
|
| 396 |
+
else:
|
| 397 |
+
related_context_documents_text_list=[]
|
| 398 |
+
for related_context_documents,related_context_keys in zip(general_related_context_documents_list,general_related_context_keys_list):
|
| 399 |
+
related_information=""
|
| 400 |
+
most_related_clips=self.get_most_related_clips(related_context_keys)
|
| 401 |
+
for clip_name in most_related_clips:
|
| 402 |
+
clip_conversation=""
|
| 403 |
+
general_sum=""
|
| 404 |
+
for key in QA_external_memory.documents.keys():
|
| 405 |
+
if clip_name in key and 'caption' in key:
|
| 406 |
+
general_sum="Clip Summary: "+QA_external_memory.documents[key]
|
| 407 |
+
if clip_name in key and 'subtitle' in key:
|
| 408 |
+
clip_conversation="Clip Subtitles: "+QA_external_memory.documents[key]
|
| 409 |
+
if args.use_coherent_description:
|
| 410 |
+
related_information+=f"{general_sum}\n"
|
| 411 |
+
else:
|
| 412 |
+
if args.model_summary_only:
|
| 413 |
+
related_information+=f"{general_sum}\n"
|
| 414 |
+
elif args.subtitles_only:
|
| 415 |
+
related_information+=f"{clip_conversation}\n"
|
| 416 |
+
else:
|
| 417 |
+
related_information+=f"{general_sum},{clip_conversation}\n"
|
| 418 |
+
|
| 419 |
+
related_context_documents_text_list.append(related_information)
|
| 420 |
+
|
| 421 |
+
if len (qa_genearl_prompts) >0 and args.use_chatgpt :
|
| 422 |
+
batch_pred=self.inference_RAG_chatGPT(qa_genearl_prompts,related_context_documents_text_list)
|
| 423 |
+
elif len (qa_genearl_prompts) >0:
|
| 424 |
+
batch_pred=self.inference_RAG(qa_genearl_prompts,related_context_documents_text_list)
|
| 425 |
+
total_batch_pred.extend(batch_pred)
|
| 426 |
+
related_text.extend(related_context_documents_text_list)
|
| 427 |
+
assert len(total_batch_pred)==len(related_text)
|
| 428 |
+
return total_batch_pred, related_text
|
| 429 |
+
def get_most_related_clips(self,related_context_keys):
|
| 430 |
+
most_related_clips=[]
|
| 431 |
+
for context_key in related_context_keys:
|
| 432 |
+
if len(context_key.split('__'))>1:
|
| 433 |
+
most_related_clips.append(context_key.split('__')[1])
|
| 434 |
+
if len(most_related_clips)==args.neighbours:
|
| 435 |
+
break
|
| 436 |
+
assert len(most_related_clips)!=0, f"No related clips found {related_context_keys}"
|
| 437 |
+
return most_related_clips
|
| 438 |
+
|
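    # Retrieval keys follow the caption_{idx}__{movie}_clip_{nn} /
    # subtitle_{idx}__{movie}_clip_{nn} convention written by _get_movie_summaries,
    # so get_most_related_clips above recovers clip folder names by splitting on
    # '__'. A sketch with made-up keys:
    #   keys = ["caption_3__tt0120338_clip_07", "subtitle_3__tt0120338_clip_07"]
    #   [k.split('__')[1] for k in keys]  ->  ['tt0120338_clip_07', 'tt0120338_clip_07']
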
    def clip_inference(self, clips_name, prompts):
        setup_seeds(seed)
        images_batch, instructions_batch = [], []
        for clip_name, prompt in zip(clips_name, prompts):
            movie_name = clip_name.split('_')[0]
            video_images_path, subtitle_path, movie_annotation, movie_clips_path = self._get_movie_data(movie_name)
            clip_path = os.path.join(movie_clips_path, clip_name)
            if movie_annotation['story'] is not None:
                shots_subtitles = self._get_shots_subtitles(movie_annotation)
            else:
                shots_subtitles = {}
            images, img_placeholder = self.prepare_input_images(clip_path, shots_subtitles, use_subtitles=not args.vision_only)
            instruction = img_placeholder + '\n' + prompt
            images_batch.append(images)
            instructions_batch.append(instruction)
        # run inference for the batch
        images_batch = torch.stack(images_batch)
        batch_pred = self.run_images(images_batch, instructions_batch)
        return batch_pred

    def prepare_prompt(self, qa):
        prompt = qa["Q"]
        return prompt
    def use_clips_for_info(self, qa_list, related_context_keys_list, external_memory):
        total_batch_pred = []
        questions = []
        related_information_list = []
        related_context_keys_list_new = []
        for qa, related_context_keys in zip(qa_list, related_context_keys_list):
            most_related_clips = self.get_most_related_clips(related_context_keys)
            question = qa['Q']
            prompt = f"From this video extract the information related to this question and provide an explanation for your answer. If you can't find related information, say 'I DON'T KNOW' as option 5 because the question may not be related to the video content.\n the question is :\n {question}\n your answer :"
            # make most_related_clips unique (retrieval may return both the vision summary and the conversation of the same clip)
            most_related_clips = list(set(most_related_clips))
            batch_inference = []
            all_info = []
            for related_clip in most_related_clips:
                batch_inference.append(related_clip)
                if len(batch_inference) < args.batch_size:
                    continue
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))
                batch_inference = []
            if len(batch_inference) > 0:
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))

            related_information = ""
            for info, clip_name in zip(all_info, most_related_clips):
                clip_conversation = ""
                general_sum = ""
                for key in external_memory.documents.keys():
                    if clip_name in key and 'caption' in key:
                        general_sum = "Clip Summary: " + external_memory.documents[key]
                    if clip_name in key and 'subtitle' in key:
                        clip_conversation = "Clip Subtitles: " + external_memory.documents[key]

                if args.use_coherent_description:
                    related_information += f"question_related_information: {info},{general_sum}\n"
                else:
                    if args.model_summary_only:
                        related_information += f"{general_sum},question_related_information: {info}\n"
                    elif args.info_only:
                        related_information += f"question_related_information: {info}\n"
                    elif args.subtitles_only:
                        related_information += f"{clip_conversation},question_related_information: {info}\n"
                    else:
                        related_information += f"{general_sum},{clip_conversation},question_related_information: {info}\n"

            questions.append(question)
            related_information_list.append(related_information)
            related_context_keys.append(related_information)
            related_context_keys_list_new.append(related_context_keys)
            if len(questions) < args.batch_size:
                continue
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)

            for pred in batch_pred:
                total_batch_pred.append(pred)
            questions = []
            related_information_list = []

        if len(questions) > 0:
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)
            for pred in batch_pred:
                total_batch_pred.append(pred)
        return total_batch_pred, related_context_keys_list_new
    def define_save_name(self):
        save_name = "subtitles" if args.index_subtitles_together else "no_subtitles"
        save_name += "_clips_for_info" if args.use_clips_for_info else ""
        save_name += "_chatgpt" if args.use_chatgpt else ""
        save_name += "_vision_only" if args.vision_only else ""
        save_name += "_model_summary_only" if args.model_summary_only else ""
        save_name += "_subtitles_only" if args.subtitles_only else ""
        save_name += "_info_only" if args.info_only else ""
        print("save_name", save_name)
        return save_name

    def eval_llama_vid(self):
        # LLaMA-VID QA evaluation
        full_questions_result = []
        movie_number = 0
        start = args.start
        end = args.end
        save_name = self.define_save_name()
        for movie in tqdm(self.movies_dict.keys()):
            if args.start <= movie_number < args.end:
                save_dir = f"new_workspace/results/llama_vid/{args.exp_name}/{save_name}_{args.neighbours}_neighbours"
                os.makedirs(save_dir, exist_ok=True)
                if os.path.exists(f"{save_dir}/{movie}.json"):
                    print(f"Movie {movie} already processed")
                    with open(f"{save_dir}/{movie}.json", 'r') as f:
                        pred_json = json.load(f)
                    full_questions_result.extend(pred_json)
                    movie_number += 1
                    continue
                use_subtitles_while_generating_summary = not args.vision_only
                information_RAG_path, embedding_path = self.movie_inference(movie, use_subtitles_while_generating_summary)
                # building the index here materializes the embedding pickle once per movie,
                # so later calls can load it from disk
                external_memory = MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
                if os.path.exists(embedding_path):
                    external_memory.load_embeddings_from_pkl(embedding_path)
                else:
                    external_memory.load_documents_from_json(information_RAG_path, embedding_path)
                pred_json = []
                batch_questions = []
                for qa in tqdm(self.movies_dict[movie], desc="Inference questions"):
                    batch_questions.append(qa)
                    if len(batch_questions) < args.batch_size:
                        continue
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, information_RAG_path, embedding_path)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                    batch_questions = []
                if len(batch_questions) > 0:
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, information_RAG_path, embedding_path)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                full_questions_result.extend(pred_json)
                with open(f"{save_dir}/{movie}.json", 'w') as fp:
                    json.dump(pred_json, fp)
                print(f"Movie {movie} prediction saved to {save_dir}/{movie}.json")
            movie_number += 1
        with open(f"{save_dir}/full_pred_s{start}_end{end}.json", 'w') as fp:
            json.dump(full_questions_result, fp)

args = get_arguments()

def setup_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

# read the run seed from test_configs/llama2_test_config.yaml
with open('test_configs/llama2_test_config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
seed = config['run']['seed']
print("seed", seed)

if __name__ == "__main__":
    setup_seeds(seed)
    llama_vid_eval = LlamaVidQAEval(args)
    llama_vid_eval.eval_llama_vid()
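
# A minimal sketch of the per-movie flow this script runs (the movie id and the
# slicing below are illustrative placeholders, not values from the datasets):
#   evaluator = LlamaVidQAEval(args)
#   info_path, emb_path = evaluator.movie_inference("tt0120338", use_subtitles=not args.vision_only)
#   preds, contexts = evaluator.answer_movie_questions_RAG(
#       evaluator.movies_dict["tt0120338"][:args.batch_size], info_path, emb_path)
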
evaluation/eval_goldfish_movie_chat.py
ADDED
@@ -0,0 +1,453 @@
import sys
import os
project_dir = os.getcwd()
sys.path.append(project_dir)
import json
import argparse
import random
import yaml
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import h5py
import pysrt
import chardet
from tqdm import tqdm
from torch.utils.data import Dataset
from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
from minigpt4.common.eval_utils import init_model
from minigpt4.conversation.conversation import CONV_VISION
from index import MemoryIndex

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--neighbours_global", type=int, default=-1)
    parser.add_argument("--fps", type=float, default=0.5)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')
    parser.add_argument("--index_subtitles", action='store_true')
    parser.add_argument("--index_subtitles_together", action='store_true')

    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--summary_with_subtitles_only", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')
    parser.add_argument("--v_sum_and_info", action='store_true')

    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)
    parser.add_argument("--exp_name", type=str, default="", help="name of eval folder")

    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--dataset_videos_path", type=str, help="path to the dataset videos")
    parser.add_argument("--annotation_json_folder", type=str, help="path to the annotation folder")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

def get_movie_time(subtitle_path):
    # read the subtitle file, detecting its encoding first
    with open(subtitle_path, 'rb') as f:
        result = chardet.detect(f.read())
    subtitles = pysrt.open(subtitle_path, encoding=result['encoding'])
    video_time = time_to_seconds(subtitles[-1].end)
    return video_time

def numerical_sort_key(filename):
    base_name = os.path.splitext(filename)[0]
    return int(base_name)
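
# numerical_sort_key matters because the feature files are named by integer
# index, and a plain lexicographic sort misorders them. Illustrative check
# (never called by the script):
def _sort_demo():
    names = ["9.h5", "10.h5", "1.h5"]
    print(sorted(names))                          # ['1.h5', '10.h5', '9.h5'] -- wrong order
    print(sorted(names, key=numerical_sort_key))  # ['1.h5', '9.h5', '10.h5']
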
class MovieChatDataset(Dataset):
    def __init__(self, dataset_path, annotation_path, fps, transform=None, start=0, end=100000):
        self.dataset_path = dataset_path
        self.annotation_path = annotation_path
        self.transform = transform
        self.movie_name = os.listdir(dataset_path)
        self.movie_name = [file for file in self.movie_name if file != '.DS_Store']
        self.fps = fps
        self.len_clip = 45
        self.start = start
        self.end = end

    def load_frames(self, movie_name):
        # the feature files are named by integer index, so sort them numerically
        filenames = sorted(os.listdir(os.path.join(self.dataset_path, movie_name)), key=numerical_sort_key)
        data = []
        for filename_number in tqdm(filenames, desc="Loading frames"):
            file_path = os.path.join(self.dataset_path, movie_name, filename_number)
            if not os.path.isfile(file_path):
                print(f"Did not find file: {filename_number}")
            try:
                with h5py.File(file_path, 'r') as h5_file:
                    # filename_number[:-3] strips the '.h5' extension
                    image_embeds = torch.tensor(h5_file[f"frames_{filename_number[:-3]}"][:])
                    image_embeds = image_embeds[:, 1:, :]  # remove the first (CLS) token -> (200, 256, 1408)
                    # concatenate every 4 neighbouring image tokens
                    bs, pn, hs = image_embeds.shape
                    image_embeds = image_embeds.view(bs, int(pn / 4), int(hs * 4))
                    data.extend(image_embeds)
            except Exception as e:
                print(f"Failed to process {filename_number}: {e}")

        frames = torch.stack(data)
        return frames

    def __len__(self):
        return len(self.movie_name)

    def _get_movie_questions(self, movie_annotations):
        global_questions = movie_annotations['global']
        local_questions = movie_annotations['breakpoint']
        return global_questions, local_questions

    def __getitem__(self, idx):
        if self.start <= idx < self.end:
            self.frames = self.load_frames(self.movie_name[idx])
            movie_name = self.movie_name[idx]
            with open(os.path.join(self.annotation_path, movie_name + ".json"), 'r') as f:
                movie_annotations = json.load(f)
            global_questions, local_questions = self._get_movie_questions(movie_annotations)
            # guard against a zero step when the requested fps exceeds the native fps
            sampling_value = max(int(movie_annotations['info']['fps'] / self.fps), 1)
            clips_list = []
            current_clip = []
            for i in range(0, self.frames.shape[0], sampling_value):
                current_clip.append(self.frames[i])
                if len(current_clip) >= self.len_clip:
                    clips_list.append(torch.stack(current_clip))
                    current_clip = []
            # pad the last (partial) clip with its final frame
            if len(current_clip) > 0:
                last_frame_current_clip = current_clip[-1]
                while len(current_clip) < self.len_clip:
                    current_clip.append(last_frame_current_clip)
                clips_list.append(torch.stack(current_clip))
            return clips_list, movie_name, global_questions, local_questions
        else:
            return [], self.movie_name[idx], [], []

+
class MovieChat (GoldFish_LV):
|
| 172 |
+
|
| 173 |
+
def __init__(self,args):
|
| 174 |
+
super().__init__(args)
|
| 175 |
+
self.args=args
|
| 176 |
+
self.save_long_videos_path = "new_workspace/clips_summary/movie_chat/"
|
| 177 |
+
if args.use_openai_embedding:
|
| 178 |
+
self.save_embedding_path = "new_workspace/open_ai_embedding/movie_chat/"
|
| 179 |
+
else:
|
| 180 |
+
self.save_embedding_path = "new_workspace/embedding/movie_chat/"
|
| 181 |
+
os.makedirs(self.save_long_videos_path, exist_ok=True)
|
| 182 |
+
os.makedirs(self.save_embedding_path, exist_ok=True)
|
| 183 |
+
self.max_sub_len=400
|
| 184 |
+
self.max_num_images=45
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _get_long_video_summaries(self,clips,save_path):
|
| 188 |
+
batch=[]
|
| 189 |
+
batch_instructions=[]
|
| 190 |
+
preds={}
|
| 191 |
+
clip_numbers=[]
|
| 192 |
+
max_caption_index=0
|
| 193 |
+
for i,clip_features in enumerate(clips):
|
| 194 |
+
if len(clip_features)!=self.max_num_images:
|
| 195 |
+
continue
|
| 196 |
+
batch.append(clip_features)
|
| 197 |
+
img_placeholder=""
|
| 198 |
+
for j in range(len(clip_features)):
|
| 199 |
+
img_placeholder+="<Img><ImageHere>"
|
| 200 |
+
instruction = img_placeholder + '\n' + self.summary_instruction
|
| 201 |
+
batch_instructions.append(instruction)
|
| 202 |
+
clip_numbers.append(i)
|
| 203 |
+
if len(batch)<args.batch_size:
|
| 204 |
+
continue
|
| 205 |
+
batch=torch.stack(batch)
|
| 206 |
+
batch_pred= self.run_images_features(batch,batch_instructions)
|
| 207 |
+
for j,pred in enumerate(batch_pred):
|
| 208 |
+
max_caption_index += 1
|
| 209 |
+
if pred !="":
|
| 210 |
+
preds[f'caption__clip_{str(clip_numbers[j]).zfill(2)}'] = pred
|
| 211 |
+
batch=[]
|
| 212 |
+
clip_numbers=[]
|
| 213 |
+
batch_instructions=[]
|
| 214 |
+
if len(batch)>0:
|
| 215 |
+
batch=torch.stack(batch)
|
| 216 |
+
batch_pred= self.run_images_features(batch,batch_instructions)
|
| 217 |
+
for j,pred in enumerate(batch_pred):
|
| 218 |
+
max_caption_index += 1
|
| 219 |
+
if pred !="":
|
| 220 |
+
preds[f'caption__clip_{str(clip_numbers[j]).zfill(2)}'] = pred
|
| 221 |
+
with open(save_path, 'w') as file:
|
| 222 |
+
json.dump(preds, file, indent=4)
|
| 223 |
+
return preds
|
| 224 |
+
def use_model_summary (self,qa_prompts,related_context_documents_list,related_context_keys_list,external_memory):
|
| 225 |
+
related_context_documents_text_list=[]
|
| 226 |
+
for related_context_documents,related_context_keys in zip(related_context_documents_list,related_context_keys_list):
|
| 227 |
+
related_information=""
|
| 228 |
+
most_related_clips=self.get_most_related_clips_index(related_context_keys,external_memory)
|
| 229 |
+
for clip_name in most_related_clips:
|
| 230 |
+
general_sum=""
|
| 231 |
+
clip_name=str(clip_name).zfill(2)
|
| 232 |
+
for key in external_memory.documents.keys():
|
| 233 |
+
if clip_name in key and 'caption' in key:
|
| 234 |
+
general_sum="Clip Summary: "+external_memory.documents[key]
|
| 235 |
+
break
|
| 236 |
+
related_information+=f"{general_sum}\n"
|
| 237 |
+
related_context_documents_text_list.append(related_information)
|
| 238 |
+
|
| 239 |
+
if args.use_chatgpt :
|
| 240 |
+
batch_pred=self.inference_RAG_chatGPT(qa_prompts,related_context_documents_text_list)
|
| 241 |
+
else:
|
| 242 |
+
batch_pred=self.inference_RAG(qa_prompts,related_context_documents_text_list)
|
| 243 |
+
return batch_pred, related_context_documents_text_list
|
| 244 |
+
def answer_movie_questions_RAG(self,qa_list,information_RAG_path,embedding_path,q_type):
|
| 245 |
+
if q_type=='local':
|
| 246 |
+
external_memory=MemoryIndex(args.neighbours, use_openai=self.args.use_openai_embedding)
|
| 247 |
+
else:
|
| 248 |
+
external_memory=MemoryIndex(args.neighbours_global, use_openai=self.args.use_openai_embedding)
|
| 249 |
+
if os.path.exists(embedding_path):
|
| 250 |
+
external_memory.load_embeddings_from_pkl(embedding_path)
|
| 251 |
+
else:
|
| 252 |
+
external_memory.load_documents_from_json(information_RAG_path,embedding_path)
|
| 253 |
+
# get the most similar context from the external memory to this instruction
|
| 254 |
+
related_context_documents_list=[]
|
| 255 |
+
related_context_keys_list=[]
|
| 256 |
+
total_batch_pred=[]
|
| 257 |
+
related_text=[]
|
| 258 |
+
qa_prompts=[]
|
| 259 |
+
for qa in qa_list:
|
| 260 |
+
related_context_documents,related_context_keys = external_memory.search_by_similarity(qa['question'])
|
| 261 |
+
related_context_documents_list.append(related_context_documents)
|
| 262 |
+
related_context_keys_list.append(related_context_keys)
|
| 263 |
+
prompt=self.prepare_prompt(qa)
|
| 264 |
+
qa_prompts.append(prompt)
|
| 265 |
+
if args.use_clips_for_info:
|
| 266 |
+
batch_pred,related_context_keys_list=self.use_clips_for_info(qa_list,related_context_keys_list,external_memory)
|
| 267 |
+
total_batch_pred.extend(batch_pred)
|
| 268 |
+
related_text.extend(related_context_keys_list)
|
| 269 |
+
else:
|
| 270 |
+
batch_pred, related_context_documents_text_list=self.use_model_summary (qa_prompts,
|
| 271 |
+
related_context_documents_list,related_context_keys_list,external_memory)
|
| 272 |
+
total_batch_pred.extend(batch_pred)
|
| 273 |
+
related_text.extend(related_context_documents_text_list)
|
| 274 |
+
assert len(total_batch_pred)==len(qa_list)
|
| 275 |
+
assert len(total_batch_pred)==len(related_text)
|
| 276 |
+
return total_batch_pred, related_text
|
| 277 |
+
def get_most_related_clips_index(self,related_context_keys,external_memory):
|
| 278 |
+
most_related_clips_index=[]
|
| 279 |
+
for context_key in related_context_keys:
|
| 280 |
+
# loop over memory keys to get the context key index
|
| 281 |
+
for i,key in enumerate(external_memory.documents.keys()):
|
| 282 |
+
if context_key in key:
|
| 283 |
+
most_related_clips_index.append(i)
|
| 284 |
+
break
|
| 285 |
+
|
| 286 |
+
return most_related_clips_index
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def clip_inference(self,clips_idx,prompts):
|
| 290 |
+
setup_seeds(seed)
|
| 291 |
+
images_batch, instructions_batch = [], []
|
| 292 |
+
for clip_idx, prompt in zip(clips_idx, prompts):
|
| 293 |
+
clip_features=self.video_clips[clip_idx]
|
| 294 |
+
img_placeholder=""
|
| 295 |
+
for j in range(len(clip_features)):
|
| 296 |
+
img_placeholder+='<Img><ImageHere>'
|
| 297 |
+
instruction = img_placeholder + '\n' + prompt
|
| 298 |
+
images_batch.append(clip_features)
|
| 299 |
+
instructions_batch.append(instruction)
|
| 300 |
+
# run inference for the batch
|
| 301 |
+
images_batch=torch.stack(images_batch)
|
| 302 |
+
batch_pred= self.run_images_features(images_batch,instructions_batch)
|
| 303 |
+
return batch_pred
|
| 304 |
+
def prepare_prompt(self,qa):
|
| 305 |
+
prompt=qa["question"]
|
| 306 |
+
return prompt
|
| 307 |
+
def use_clips_for_info(self,qa_list,related_context_keys_list,external_memory):
|
| 308 |
+
total_batch_pred=[]
|
| 309 |
+
questions=[]
|
| 310 |
+
related_information_list=[]
|
| 311 |
+
related_context_keys_list_new=[]
|
| 312 |
+
for qa,related_context_keys in zip(qa_list,related_context_keys_list):
|
| 313 |
+
most_related_clips_index=self.get_most_related_clips_index(related_context_keys,external_memory)
|
| 314 |
+
question=qa['question']
|
| 315 |
+
prompt=f"From this video extract the related information to This question and provide an explaination for your answer and If you can't find any related information, say 'I DON'T KNOW' as option 5 because maybe the questoin is not related to the video content.\n the question is :\n {question}\n your answer :"
|
| 316 |
+
batch_inference=[]
|
| 317 |
+
all_info=[]
|
| 318 |
+
for clip_idx in most_related_clips_index:
|
| 319 |
+
batch_inference.append(clip_idx)
|
| 320 |
+
if len(batch_inference)<args.batch_size:
|
| 321 |
+
continue
|
| 322 |
+
all_info.extend(self.clip_inference(batch_inference,[prompt]*len(batch_inference)))
|
| 323 |
+
batch_inference=[]
|
| 324 |
+
if len(batch_inference)>0:
|
| 325 |
+
all_info.extend(self.clip_inference(batch_inference,[prompt]*len(batch_inference)))
|
| 326 |
+
# all_info=self.clip_inference(most_related_clips_index,[prompt]*len(most_related_clips_index))
|
| 327 |
+
related_information=""
|
| 328 |
+
for info,clip_name in zip(all_info,most_related_clips_index):
|
| 329 |
+
general_sum=""
|
| 330 |
+
clip_name=str(clip_name).zfill(2)
|
| 331 |
+
for key in external_memory.documents.keys():
|
| 332 |
+
if clip_name in key and 'caption' in key:
|
| 333 |
+
general_sum="Clip Summary: "+external_memory.documents[key]
|
| 334 |
+
if args.v_sum_and_info:
|
| 335 |
+
related_information+=f"{general_sum},question_related_information: {info}\n"
|
| 336 |
+
else:
|
| 337 |
+
related_information+=f"question_related_information: {info}\n"
|
| 338 |
+
questions.append(question)
|
| 339 |
+
related_information_list.append(related_information)
|
| 340 |
+
related_context_keys.append(related_information)
|
| 341 |
+
related_context_keys_list_new.append(related_context_keys)
|
| 342 |
+
if len(questions)< args.batch_size:
|
| 343 |
+
continue
|
| 344 |
+
setup_seeds(seed)
|
| 345 |
+
if args.use_chatgpt :
|
| 346 |
+
batch_pred=self.inference_RAG_chatGPT(questions, related_information_list)
|
| 347 |
+
else:
|
| 348 |
+
batch_pred=self.inference_RAG(questions, related_information_list)
|
| 349 |
+
|
| 350 |
+
for pred in batch_pred:
|
| 351 |
+
total_batch_pred.append(pred)
|
| 352 |
+
questions=[]
|
| 353 |
+
related_information_list=[]
|
| 354 |
+
|
| 355 |
+
if len(questions)>0:
|
| 356 |
+
setup_seeds(seed)
|
| 357 |
+
if args.use_chatgpt :
|
| 358 |
+
batch_pred=self.inference_RAG_chatGPT(questions, related_information_list)
|
| 359 |
+
else:
|
| 360 |
+
batch_pred=self.inference_RAG(questions, related_information_list)
|
| 361 |
+
for pred in batch_pred:
|
| 362 |
+
total_batch_pred.append(pred)
|
| 363 |
+
return total_batch_pred,related_context_keys_list_new
|
| 364 |
+
def define_save_name(self):
|
| 365 |
+
save_name="subtitles" if args.index_subtitles else "no_subtitles"
|
| 366 |
+
save_name="subtitles_together" if args.index_subtitles_together else save_name
|
| 367 |
+
save_name="summary_with_subtitles_only" if args.summary_with_subtitles_only else save_name
|
| 368 |
+
save_name+="_unknown" if args.add_unknown else ""
|
| 369 |
+
save_name+="_clips_for_info" if args.use_clips_for_info else ""
|
| 370 |
+
save_name+="_chatgpt" if args.use_chatgpt else ""
|
| 371 |
+
save_name+="_choices_for_info" if args.use_choices_for_info else ""
|
| 372 |
+
save_name+="_v_sum_and_info" if args.v_sum_and_info else ""
|
| 373 |
+
save_name+='fps_'+str(args.fps)
|
| 374 |
+
save_dir=f"new_workspace/results/moviechat/{args.exp_name}/{save_name}_{args.neighbours_global}_neighbours"
|
| 375 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 376 |
+
return save_dir
|
| 377 |
+
|
| 378 |
+
def eval_moviechat(self):
|
| 379 |
+
start=args.start
|
| 380 |
+
end=args.end
|
| 381 |
+
dataset_path = args.dataset_videos_path
|
| 382 |
+
annotation_json_folder=args.annotation_json_folder
|
| 383 |
+
dataset = MovieChatDataset(dataset_path,annotation_json_folder, fps=args.fps,start=start,end=end)
|
| 384 |
+
# dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
|
| 385 |
+
full_questions_result=[]
|
| 386 |
+
save_dir=self.define_save_name()
|
| 387 |
+
|
| 388 |
+
for i,(clips ,video_name,global_questions,local_questions) in enumerate(dataset):
|
| 389 |
+
# code here
|
| 390 |
+
if start<=i < end:
|
| 391 |
+
print("video_name",video_name)
|
| 392 |
+
self.video_clips=clips
|
| 393 |
+
self.video_name=video_name
|
| 394 |
+
file_path=os.path.join(self.save_long_videos_path,self.video_name+f"_fps{args.fps}.json")
|
| 395 |
+
embedding_path=os.path.join(self.save_embedding_path,self.video_name+f"_fps{args.fps}.pkl")
|
| 396 |
+
if os.path.exists(file_path):
|
| 397 |
+
print("Already processed")
|
| 398 |
+
else:
|
| 399 |
+
self._get_long_video_summaries(clips,file_path)
|
| 400 |
+
batch_questions=[]
|
| 401 |
+
for qa in global_questions:
|
| 402 |
+
batch_questions.append(qa)
|
| 403 |
+
if len(batch_questions)<args.batch_size:
|
| 404 |
+
continue
|
| 405 |
+
model_answers, related_text=self.answer_movie_questions_RAG(batch_questions,file_path,embedding_path,q_type='global')
|
| 406 |
+
for qa,ans in zip(batch_questions,model_answers):
|
| 407 |
+
qa.update({'pred':ans})
|
| 408 |
+
qa['Q']=qa['question']
|
| 409 |
+
qa['A']=qa['answer']
|
| 410 |
+
qa.pop('question', None)
|
| 411 |
+
qa.pop('answer', None)
|
| 412 |
+
|
| 413 |
+
batch_questions=[]
|
| 414 |
+
if len(batch_questions)>0:
|
| 415 |
+
model_answers, related_text=self.answer_movie_questions_RAG(batch_questions,file_path,embedding_path,q_type='global')
|
| 416 |
+
for qa,ans in zip(batch_questions,model_answers):
|
| 417 |
+
qa.update({'pred':ans})
|
| 418 |
+
qa['Q']=qa['question']
|
| 419 |
+
qa['A']=qa['answer']
|
| 420 |
+
qa.pop('question', None)
|
| 421 |
+
qa.pop('answer', None)
|
| 422 |
+
|
| 423 |
+
full_questions_result.extend(global_questions)
|
| 424 |
+
print(f"Finished {i} out of {len(dataset)}")
|
| 425 |
+
# save the results
|
| 426 |
+
with open(f"{save_dir}/{self.video_name}.json", 'w') as file:
|
| 427 |
+
# json.dump(global_questions+local_questions, file, indent=4)
|
| 428 |
+
json.dump(global_questions, file, indent=4)
|
| 429 |
+
|
| 430 |
+
with open(f"{save_dir}/full_pred_{start}_{end}.json", 'w') as fp:
|
| 431 |
+
json.dump(full_questions_result, fp)
|
| 432 |
+
args=get_arguments()
|
| 433 |
+
|
| 434 |
+
def setup_seeds(seed):
|
| 435 |
+
random.seed(seed)
|
| 436 |
+
np.random.seed(seed)
|
| 437 |
+
torch.manual_seed(seed)
|
| 438 |
+
torch.cuda.manual_seed(seed)
|
| 439 |
+
cudnn.benchmark = False
|
| 440 |
+
cudnn.deterministic = True
|
| 441 |
+
|
| 442 |
+
import yaml
|
| 443 |
+
# read this file test_configs/llama2_test_config.yaml
|
| 444 |
+
with open('test_configs/llama2_test_config.yaml') as file:
|
| 445 |
+
config = yaml.load(file, Loader=yaml.FullLoader)
|
| 446 |
+
seed=config['run']['seed']
|
| 447 |
+
print("seed",seed)
|
| 448 |
+
|
| 449 |
+
if __name__ == "__main__":
|
| 450 |
+
setup_seeds(seed)
|
| 451 |
+
llama_vid_eval=MovieChat(args)
|
| 452 |
+
llama_vid_eval.eval_moviechat()
|
| 453 |
+
|
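The question loops above fill a batch up to `args.batch_size`, run one batched model call, then flush whatever is left over after the loop. A minimal sketch of that accumulate-and-flush pattern (the `answer_batch` stand-in is hypothetical, for illustration only):

def answer_batch(questions):
    # hypothetical stand-in for the batched model call
    return [f"answer to: {q}" for q in questions]

def answer_all(questions, batch_size=8):
    results, batch = [], []
    for q in questions:
        batch.append(q)
        if len(batch) < batch_size:
            continue
        results.extend(answer_batch(batch))   # full batch: run inference once
        batch = []
    if batch:                                  # flush the leftover partial batch
        results.extend(answer_batch(batch))
    return results

assert len(answer_all([f"q{i}" for i in range(19)])) == 19   # 8 + 8 + 3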
evaluation/eval_goldfish_movie_qa.py
ADDED
@@ -0,0 +1,591 @@
import sys
import os
project_dir = os.getcwd()
sys.path.append(project_dir)
import json
import argparse
import re
import random
import shutil
import torch
import numpy as np
import torch.backends.cudnn as cudnn
from tqdm import tqdm
from PIL import Image
# from openai import OpenAI
from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
from index import MemoryIndex
import pysrt
import chardet

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')
    parser.add_argument("--index_subtitles", action='store_true')
    parser.add_argument("--index_subtitles_together", action='store_true')

    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--summary_with_subtitles_only", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')

    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)
    parser.add_argument("--exp_name", type=str, default="", help="name of eval folder")

    parser.add_argument("--vision_only", action='store_true')
    parser.add_argument("--model_summary_only", action='store_true')
    parser.add_argument("--subtitles_only", action='store_true')
    parser.add_argument("--info_only", action='store_true')

    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--annotation_path", type=str, help="path to the annotation file")
    parser.add_argument("--videos_path", type=str, help="path to the videos directory")
    parser.add_argument("--subtitle_path", type=str, help="path to the subtitles directory")
    parser.add_argument("--movienet_annotations_dir", type=str, help="path to the movienet annotations directory")
    parser.add_argument("--video_clips_saving_path", type=str, help="path to save the split small video clips")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

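# Illustrative check (not part of the original script): str2bool is needed because
# argparse's plain type=bool would turn any non-empty string, including "false",
# into True. A minimal self-contained demonstration:
def _demo_str2bool():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    assert parser.parse_args(["--use_openai_embedding", "yes"]).use_openai_embedding is True
    assert parser.parse_args(["--use_openai_embedding", "0"]).use_openai_embedding is False
    assert parser.parse_args([]).use_openai_embedding is False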
def time_to_seconds(subrip_time):
    return subrip_time.hours * 3600 + subrip_time.minutes * 60 + subrip_time.seconds + subrip_time.milliseconds / 1000

def get_movie_time(subtitle_path):
    # read the subtitle file and detect its encoding
    with open(subtitle_path, 'rb') as f:
        result = chardet.detect(f.read())
    subtitles = pysrt.open(subtitle_path, encoding=result['encoding'])
    video_time = time_to_seconds(subtitles[-1].end)
    return video_time

def clean_text(subtitles_text):
    # Remove unwanted characters except for letters, digits, and single quotes
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple spaces with a single space
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()

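# Illustrative check (not part of the original script) of the two helpers above:
# time_to_seconds flattens a pysrt timestamp, and clean_text strips punctuation
# while keeping contractions.
def _demo_subtitle_helpers():
    t = pysrt.SubRipTime(0, 1, 30, 500)            # 00:01:30,500
    assert time_to_seconds(t) == 90.5
    assert clean_text("Hello,   world! It's -- fine...") == "Hello world It's fine"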
class MovieQAEval(GoldFish_LV):

    def __init__(self, args):
        super().__init__(args)
        self.save_json_path = "new_workspace/clips_summary/movienet"
        if args.use_openai_embedding:
            self.save_pkls_path = "new_workspace/open_ai_embedding/movienet"
        else:
            self.save_pkls_path = "new_workspace/embedding/movienet"
        os.makedirs(self.save_json_path, exist_ok=True)
        movie_qa_dataset_path = args.annotation_path
        with open(movie_qa_dataset_path, 'r') as f:
            self.movies_dict = json.load(f)
        self.max_sub_len = 400
        self.max_num_images = 45

    def _get_movie_data(self, videoname):
        video_images_path = f"{args.videos_path}/{videoname}"
        movie_clips_path = f"{args.video_clips_saving_path}/{videoname}"
        subtitle_path = f"{args.subtitle_path}/{videoname}.srt"
        annotation_file = f"{args.movienet_annotations_dir}/{videoname}.json"
        # load the annotation file
        with open(annotation_file, 'r') as f:
            movie_annotation = json.load(f)
        return video_images_path, subtitle_path, movie_annotation, movie_clips_path

    def _store_subtitles_paragraphs(self, subtitle_path, important_data, number_of_paragraphs):
        movie_name = subtitle_path.split('/')[-1].split('.')[0]
        # if there is no story annotation, split the subtitles into paragraphs
        paragraphs = split_subtitles(subtitle_path, number_of_paragraphs)
        for i, paragraph in enumerate(paragraphs):
            paragraph = clean_text(paragraph)
            important_data.update({f"subtitle_{i}__{movie_name}_clip_{str(i).zfill(2)}": paragraph})
        return important_data

    def _get_shots_subtitles(self, movie_annotation):
        shots_subtitles = {}
        if movie_annotation['story'] is not None:
            for section in movie_annotation['story']:
                for shot in section['subtitle']:
                    shot_number = shot['shot']
                    shot_subtitle = ' '.join(shot['sentences'])
                    shots_subtitles[shot_number] = clean_text(shot_subtitle)
        return shots_subtitles

    def prepare_input_images(self, clip_path, shots_subtitles, use_subtitles):
        images = []
        img_placeholder = ""
        video_frames_path = os.path.join(clip_path)
        total_num_frames = len(os.listdir(video_frames_path))
        sampling_interval = round(total_num_frames / self.max_num_images)
        if sampling_interval == 0:
            sampling_interval = 1
        number_of_words = 0
        video_images_list = sorted(os.listdir(video_frames_path))
        for i, frame in enumerate(video_images_list):
            if i % sampling_interval == 0:
                frame = Image.open(os.path.join(video_frames_path, frame)).convert("RGB")
                frame = self.vis_processor(frame)
                images.append(frame)
                img_placeholder += '<Img><ImageHere>'
                shot_num = int(video_images_list[i].split('_')[1])
                if shots_subtitles.get(shot_num) is not None:
                    sub = clean_text(shots_subtitles[shot_num])
                    number_of_words += len(sub.split(' '))
                    if number_of_words <= self.max_sub_len and use_subtitles:
                        img_placeholder += f'<Cap>{sub}'
            if len(images) >= self.max_num_images:
                break
        if len(images) == 0:
            print("Video not found", video_frames_path)

        if 0 < len(images) < self.max_num_images:
            last_item = images[-1]
            while len(images) < self.max_num_images:
                images.append(last_item)
                img_placeholder += '<Img><ImageHere>'
        images = torch.stack(images)
        return images, img_placeholder

    def _get_movie_summaries(self, video_images_path, use_subtitles, shots_subtitles, movie_clips_path):
        video_images_list = sorted(os.listdir(video_images_path))
        max_caption_index = 0
        preds = {}
        movie_name = movie_clips_path.split('/')[-1]
        videos_summaries = []
        batch_size = args.batch_size
        batch_images = []
        batch_instructions = []
        clip_numbers = []
        clip_number = 0
        conversations = []
        # each clip covers 135 movie frames; sampling every 3rd frame gives 45 images (= self.max_num_images)
        for i in tqdm(range(0, len(video_images_list), 135), desc="Inference video clips", total=len(video_images_list) / 135):
            images = []
            img_placeholder = ""
            number_of_words = 0
            clip_number_str = str(clip_number).zfill(2)
            clip_path = os.path.join(movie_clips_path, f"{movie_name}_clip_{clip_number_str}")
            os.makedirs(clip_path, exist_ok=True)
            conversation = ""
            for j in range(i, i + 135, 3):
                if j >= len(video_images_list):
                    break
                image_path = os.path.join(video_images_path, video_images_list[j])
                # copy the images to the clip folder
                shutil.copy(image_path, clip_path)
                img = Image.open(image_path)
                images.append(self.vis_processor(img))
                img_placeholder += '<Img><ImageHere>'
                shot_num = int(video_images_list[j].split('_')[1])
                if use_subtitles:
                    if shots_subtitles.get(shot_num) is not None:
                        sub = clean_text(shots_subtitles[shot_num])
                        number_of_words += len(sub.split(' '))
                        if number_of_words <= self.max_num_words:
                            img_placeholder += f'<Cap>{sub}'
                            conversation += sub + " "
                if len(images) >= self.max_num_images:
                    break
            if len(images) == 0:
                print("Video not found", video_images_path)
                continue
            if 0 < len(images) < self.max_num_images:
                last_item = images[-1]
                while len(images) < self.max_num_images:
                    images.append(last_item)
                    img_placeholder += '<Img><ImageHere>'

            images = torch.stack(images)
            print(images.shape)
            clip_numbers.append(clip_number_str)
            clip_number += 1
            conversations.append(clean_text(conversation))
            instruction = img_placeholder + '\n' + self.summary_instruction
            batch_images.append(images)
            batch_instructions.append(instruction)
            if len(batch_images) < batch_size:
                continue
            # run inference for the batch
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                    if conversations[k] != "" and use_subtitles:
                        preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]

            batch_images = []
            batch_instructions = []
            clip_numbers = []
            conversations = []

        # run inference for the last batch
        if len(batch_images) > 0:
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for k, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[k]}"
                else:
                    preds[f'caption_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = pred
                    if conversations[k] != "" and use_subtitles:
                        preds[f'subtitle_{max_caption_index}__{movie_name}_clip_{clip_numbers[k]}'] = conversations[k]
            batch_images = []
            batch_instructions = []
        return preds
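    # Illustrative note (not part of the original script): the chunking above is
    # sized so one chunk exactly fills the frame budget; 135 consecutive frames
    # sampled every 3rd frame give 45 images, i.e. self.max_num_images.
    @staticmethod
    def _demo_chunk_arithmetic():
        chunk, stride, max_images = 135, 3, 45
        assert len(range(0, chunk, stride)) == max_images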
    def movie_inference(self, videoname, use_subtitles):
        if args.index_subtitles_together:
            file_path = os.path.join(self.save_json_path, f"{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"{videoname}.pkl")
        else:
            file_path = os.path.join(self.save_json_path, f"no_subtiltles_{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"no_subtiltles_{videoname}.pkl")

        if args.subtitles_only:
            file_path = os.path.join(self.save_json_path, f"subtiltles_only_{videoname}.json")
            embedding_path = os.path.join(self.save_pkls_path, f"subtiltles_only_{videoname}.pkl")

        if os.path.exists(file_path):
            print("Already processed")
            return file_path, embedding_path

        important_data = {}
        video_images_path, subtitle_path, movie_annotation, movie_clips_path = self._get_movie_data(videoname)
        shots_subtitles = {}
        if use_subtitles:
            if movie_annotation['story'] is not None:
                shots_subtitles = self._get_shots_subtitles(movie_annotation)
        if args.subtitles_only:
            number_of_paragraphs = 20
            important_data = self._store_subtitles_paragraphs(subtitle_path, important_data, number_of_paragraphs)
        else:
            preds = self._get_movie_summaries(video_images_path, use_subtitles, shots_subtitles, movie_clips_path)
            if len(shots_subtitles) == 0 and use_subtitles:
                number_of_paragraphs = len(preds)
                important_data = self._store_subtitles_paragraphs(subtitle_path, important_data, number_of_paragraphs)
            important_data.update(preds)
        with open(file_path, 'w') as file:
            json.dump(important_data, file, indent=4)
        return file_path, embedding_path

    def answer_movie_questions_RAG(self, qa_list, external_memory):
        # get the most similar contexts from the external memory for each question
        related_context_keys_list = []
        related_context_documents_list = []
        related_text = []
        questions = []
        prompts = []
        for qa in qa_list:
            related_context_documents, related_context_keys = external_memory.search_by_similarity(qa['question'])
            related_context_documents_list.append(related_context_documents)
            related_context_keys_list.append(related_context_keys)
            questions.append(qa)
            prompt = self.prepare_prompt(qa)
            prompts.append(prompt)
        if args.use_clips_for_info:
            batch_pred, related_context_keys_list = self.use_clips_for_info(qa_list, related_context_keys_list, external_memory)
            related_text.extend(related_context_keys_list)
        else:
            related_context_documents_text_list = []
            for related_context_documents, related_context_keys in zip(related_context_documents_list, related_context_keys_list):
                related_information = ""
                most_related_clips = self.get_most_related_clips(related_context_keys)
                for clip_name in most_related_clips:
                    clip_conversation = ""
                    general_sum = ""
                    for key in external_memory.documents.keys():
                        if clip_name in key and 'caption' in key:
                            general_sum = "Clip Summary: " + external_memory.documents[key]
                        if clip_name in key and 'subtitle' in key:
                            clip_conversation = "Clip Subtitles: " + external_memory.documents[key]

                    if args.model_summary_only:
                        related_information += f"{general_sum}\n"
                    elif args.subtitles_only:
                        related_information += f"{clip_conversation}\n"
                    else:
                        related_information += f"{general_sum},{clip_conversation}\n"

                related_context_documents_text_list.append(related_information)

            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(prompts, related_context_documents_text_list)
                related_text.extend(related_context_documents_text_list)
            else:
                batch_pred = self.inference_RAG(prompts, related_context_documents_text_list)
                related_text.extend(related_context_documents_text_list)
        return batch_pred, related_text

    def get_most_related_clips(self, related_context_keys):
        most_related_clips = []
        for context_key in related_context_keys:
            if len(context_key.split('__')) > 1:
                most_related_clips.append(context_key.split('__')[1])
            if len(most_related_clips) == args.neighbours:
                break
        assert len(most_related_clips) != 0, f"No related clips found {related_context_keys}"
        return most_related_clips

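    # Illustrative note (not part of the original script): retrieved keys embed the
    # source clip after a double underscore, e.g. "caption_7__tt0120338_clip_03",
    # which is what get_most_related_clips splits on (key names made up here).
    @staticmethod
    def _demo_key_parsing():
        keys = ["caption_7__tt0120338_clip_03", "subtitle_2__tt0120338_clip_01", "summary"]
        clips = [k.split('__')[1] for k in keys if len(k.split('__')) > 1]
        assert clips == ['tt0120338_clip_03', 'tt0120338_clip_01']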
    def clip_inference(self, clips_name, prompts):
        setup_seeds(seed)
        images_batch, instructions_batch = [], []
        for clip_name, prompt in zip(clips_name, prompts):
            movie_name = clip_name.split('_')[0]
            video_images_path, subtitle_path, movie_annotation, movie_clips_path = self._get_movie_data(movie_name)
            clip_path = os.path.join(movie_clips_path, clip_name)
            if movie_annotation['story'] is not None:
                shots_subtitles = self._get_shots_subtitles(movie_annotation)
            else:
                shots_subtitles = {}
            images, img_placeholder = self.prepare_input_images(clip_path, shots_subtitles, use_subtitles=not args.vision_only)
            instruction = img_placeholder + '\n' + prompt
            images_batch.append(images)
            instructions_batch.append(instruction)
        # run inference for the batch
        images_batch = torch.stack(images_batch)
        batch_pred = self.run_images(images_batch, instructions_batch)
        return batch_pred

    def prepare_prompt(self, qa):
        prompt = qa["question"] + " \nAs you watched in this video, choose ONE suitable answer from these multiple choices \n"
        for i, choice in enumerate(qa['choices']):
            prompt += f"option {i}: {choice} \n"
        if args.add_unknown and args.add_confidance_score:
            # Add the unknown option
            prompt += "option 5: Can't answer based on the provided information\n"
            prompt += "Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE and also output a CONFIDENCE SCORE FROM 0 TO 5 representing how confident you are with your answer, where 0 is the least confident and 5 is the most confident"
        elif args.add_unknown:
            prompt += "option 5: Can't answer based on the provided information\n"
            prompt += "Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE"
        elif args.add_confidance_score:
            prompt += "Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE and also output a CONFIDENCE SCORE FROM 0 TO 5 representing how confident you are with your answer, where 0 is the least confident and 5 is the most confident"
        else:
            prompt += "Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE"
        return prompt
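    # Illustrative note (not part of the original script): with --add_unknown set and
    # no confidence score, a QA entry like the hypothetical one below is rendered as
    # the question, options 0-4, an extra "option 5" escape hatch, and the 0-5 answer
    # instruction. Note the unknown option is always numbered 5.
    #
    #   qa = {"question": "Where does Jack first meet Rose?",
    #         "choices": ["On the deck", "In the dining hall", "At the bow",
    #                     "In the cargo hold", "On the grand staircase"]}
    #   prepare_prompt(qa) ->
    #       Where does Jack first meet Rose?
    #       As you watched in this video, choose ONE suitable answer from these multiple choices
    #       option 0: On the deck
    #       ...
    #       option 4: On the grand staircase
    #       option 5: Can't answer based on the provided information
    #       Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE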
    def use_clips_for_info(self, qa_list, related_context_keys_list, external_memory):
        total_batch_pred = []
        questions = []
        related_information_list = []
        related_context_keys_list_new = []
        for qa, related_context_keys in zip(qa_list, related_context_keys_list):
            most_related_clips = self.get_most_related_clips(related_context_keys)

            question = qa['question'] + "\n and these are the options for the question\n\n"
            for i, choice in enumerate(qa['choices']):
                question += f"option {i}: {choice} \n\n"
            if args.add_unknown:
                question += "option 5: Can't answer based on the provided information\n\n"
                question += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE"
            else:
                question += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE"

            if args.use_choices_for_info:
                prompt = f"From this video extract the information related to this multiple-choice question and provide an explanation for your answer. If you can't find any related information, say 'I DON'T KNOW' as option 5 because the question may not be related to the video content.\n the question is :\n {question}\n your answer :"
            else:
                prompt = f"As you watched in this video answer this {qa['question']}\n\n and also provide an EXPLANATION for your answer and if you don't know the answer, say that you don't know.\n\n"
            # make most_related_clips unique (retrieval may hit both the vision summary and the subtitles of the same clip)
            most_related_clips = list(set(most_related_clips))

            batch_inference = []
            all_info = []
            for related_clip in most_related_clips:
                batch_inference.append(related_clip)
                if len(batch_inference) < args.batch_size:
                    continue
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))
                batch_inference = []
            if len(batch_inference) > 0:
                all_info.extend(self.clip_inference(batch_inference, [prompt] * len(batch_inference)))

            related_information = ""
            for info, clip_name in zip(all_info, most_related_clips):
                clip_conversation = ""
                general_sum = ""
                for key in external_memory.documents.keys():
                    if clip_name in key and 'caption' in key:
                        general_sum = "Clip Summary: " + external_memory.documents[key]
                    if clip_name in key and 'subtitle' in key:
                        clip_conversation = "Clip Subtitles: " + external_memory.documents[key]

                if args.use_coherent_description:
                    related_information += f"question_related_information: {info},{general_sum}\n"
                else:
                    if args.model_summary_only:
                        related_information += f"{general_sum},question_related_information: {info}\n"
                    elif args.info_only:
                        related_information += f"question_related_information: {info}\n"
                    elif args.subtitles_only:
                        related_information += f"{clip_conversation},question_related_information: {info}\n"
                    else:
                        related_information += f"{general_sum},{clip_conversation},question_related_information: {info}\n"

            questions.append(question)
            related_information_list.append(related_information)
            related_context_keys.append(related_information)
            related_context_keys_list_new.append(related_context_keys)
            if len(questions) < args.batch_size:
                continue
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)

            for pred in batch_pred:
                total_batch_pred.append(pred)
            questions = []
            related_information_list = []

        if len(questions) > 0:
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)
            for pred in batch_pred:
                total_batch_pred.append(pred)
        return total_batch_pred, related_context_keys_list_new

    def define_save_name(self):
        save_name = "subtitles" if args.index_subtitles_together else "no_subtitles"
        save_name += "_clips_for_info" if args.use_clips_for_info else ""
        save_name += "_chatgpt" if args.use_chatgpt else ""
        save_name += "_vision_only" if args.vision_only else ""
        save_name += "_model_summary_only" if args.model_summary_only else ""
        save_name += "_subtitles_only" if args.subtitles_only else ""
        save_name += "_choices_for_info" if args.use_choices_for_info else ""
        save_name += "_unknown" if args.add_unknown else ""
        save_name += "_info_only" if args.info_only else ""
        print("save_name", save_name)
        return save_name

    def eval_movie_qa(self):
        ## Movie QA evaluation
        full_questions_result = []
        movie_number = 0
        start = args.start
        end = args.end
        for movie in tqdm(self.movies_dict.keys()):
            # if the movie has no answers, skip it
            if self.movies_dict[movie][0]['answer'] is None:
                continue
            if args.start <= movie_number < args.end:
                save_name = self.define_save_name()
                save_dir = f"new_workspace/results/movie_qa/{args.exp_name}/{save_name}_{args.neighbours}_neighbours"
                if os.path.exists(f"{save_dir}/{movie}.json"):
                    print(f"Movie {movie} already processed")
                    with open(f"{save_dir}/{movie}.json", 'r') as f:
                        pred_json = json.load(f)
                    full_questions_result.extend(pred_json)
                    continue
                use_subtitles_while_generating_summary = not args.vision_only
                information_RAG_path, embedding_path = self.movie_inference(movie, use_subtitles_while_generating_summary)
                external_memory = MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
                if os.path.exists(embedding_path):
                    external_memory.load_embeddings_from_pkl(embedding_path)
                else:
                    external_memory.load_documents_from_json(information_RAG_path, emdedding_path=embedding_path)

                os.makedirs(save_dir, exist_ok=True)
                pred_json = []
                batch_questions = []
                for qa in tqdm(self.movies_dict[movie]):
                    batch_questions.append(qa)
                    if len(batch_questions) < args.batch_size:
                        continue
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, external_memory)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                    batch_questions = []
                if len(batch_questions) > 0:
                    model_ans, related_text = self.answer_movie_questions_RAG(batch_questions, external_memory)
                    for qa, ans, related_info in zip(batch_questions, model_ans, related_text):
                        qa.update({'pred': ans})
                        qa.update({'related_info': related_info})
                        pred_json.append(qa)
                full_questions_result.extend(pred_json)
                with open(f"{save_dir}/{movie}.json", 'w') as fp:
                    json.dump(pred_json, fp)
                print(f"Movie {movie} prediction saved to {save_dir}/{movie}.json")
            movie_number += 1
        with open(f"{save_dir}/full_pred_s{start}_end{end}.json", 'w') as fp:
            json.dump(full_questions_result, fp)

args = get_arguments()

def setup_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

import yaml
with open('test_configs/llama2_test_config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
seed = config['run']['seed']
print("seed", seed)

if __name__ == "__main__":
    setup_seeds(seed)
    movie_qa_eval = MovieQAEval(args)
    movie_qa_eval.eval_movie_qa()
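Taken together, `eval_movie_qa` follows the Goldfish retrieval pipeline: summarize each clip, index the summaries (and subtitles) in `MemoryIndex`, retrieve the top-`neighbours` entries per question, and answer from the retrieved text. A condensed sketch of the per-movie flow under those assumptions (batching and result saving omitted; method names mirror the class above):

def answer_one_movie(evaluator, movie, args):
    # 1) per-clip summaries (cached as JSON on later runs)
    summaries_json, emb_pkl = evaluator.movie_inference(movie, use_subtitles=not args.vision_only)
    # 2) build or reload the vector index over summaries/subtitles
    memory = MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
    if os.path.exists(emb_pkl):
        memory.load_embeddings_from_pkl(emb_pkl)
    else:
        memory.load_documents_from_json(summaries_json, emdedding_path=emb_pkl)
    # 3) retrieve + answer every question for this movie
    preds, _ = evaluator.answer_movie_questions_RAG(evaluator.movies_dict[movie], memory)
    return preds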
evaluation/eval_goldfish_tvqa_long.py
ADDED
@@ -0,0 +1,535 @@
import sys
import os
project_dir = os.getcwd()
sys.path.append(project_dir)
import json
import argparse
import re
import random
import torch
import numpy as np
import torch.backends.cudnn as cudnn
from tqdm import tqdm
from PIL import Image
# from openai import OpenAI
from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
from index import MemoryIndex
import pysrt
import chardet

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--exp_name", type=str, default="", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')
    parser.add_argument("--index_subtitles_together", action='store_true')

    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')

    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)

    parser.add_argument("--vision_only", action='store_true')
    parser.add_argument("--model_summary_only", action='store_true')
    parser.add_argument("--subtitles_only", action='store_true')
    parser.add_argument("--subtitles_only_after_retrieval", action='store_true')
    parser.add_argument("--info_only", action='store_true')

    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--annotation_path", type=str, help="path to the annotation file")
    parser.add_argument("--videos_frames", type=str, help="path to the dataset's extracted frames")
    parser.add_argument("--tvqa_json_subtitles", type=str, help="path to the tvqa json subtitles")
    parser.add_argument("--tvqa_clips_subtitles", type=str, help="path to the tvqa per-clip subtitles")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

def clean_text(subtitles_text):
    # Remove unwanted characters except for letters, digits, and single quotes
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple spaces with a single space
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()

class TVQAEVAL(GoldFish_LV):
    def __init__(self, args: argparse.Namespace) -> None:
        super().__init__(args)
        self.tv_shows_mapping = {"Grey's Anatomy": "grey_frames", 'How I Met You Mother': "met_frames", 'Friends': "friends_frames", 'The Big Bang Theory': "bbt_frames", 'House M.D.': "house_frames", 'Castle': "castle_frames"}
        self.save_long_videos_path = "new_workspace/clips_summary/tvqa"
        if args.use_openai_embedding:
            self.save_embedding_path = "new_workspace/open_ai_embedding/tvqa"
        else:
            self.save_embedding_path = "new_workspace/embedding/tvqa"
        os.makedirs(self.save_long_videos_path, exist_ok=True)
        self.max_sub_len = 400
        self.max_num_images = 45
        self.fps = 3
        with open(args.tvqa_json_subtitles) as f:
            self.subtitles_list = json.load(f)
        self.subtitles = {}
        for sub in self.subtitles_list:
            self.subtitles[sub["vid_name"]] = sub["sub"]

    def _get_TVs_data(self):
        json_file_path = args.annotation_path
        frames_path = args.videos_frames
        subtitle_path = args.tvqa_clips_subtitles
        with open(json_file_path) as f:
            tv_shows_data = json.load(f)
        return tv_shows_data, frames_path, subtitle_path

    def _get_shows_subtitles(self, clip_subtitles_path):
        try:
            with open(clip_subtitles_path, 'rb') as f:
                result = chardet.detect(f.read())
            clip_subtitles = pysrt.open(clip_subtitles_path, encoding=result['encoding'])
            return clip_subtitles
        except:
            print("No subtitles found")
            return []

    def episode_inference(self, clips, folder_name, use_subtitles):
        max_caption_index = 0
        max_subtitle_index = 0
        preds = {}
        important_data = {}
        videos_summaries = []
        batch_size = args.batch_size
        batch_images = []
        batch_instructions = []
        conversations = []
        clips_names = []
        for clip_name in tqdm(clips, desc="Inference Episode clips"):
            conversation = ""
            try:
                for subtitle in self.subtitles[clip_name]:
                    conversation += subtitle['text'] + " "
            except:
                pass
            conversations.append(clean_text(conversation))
            images, img_placeholder = self.prepare_input_images(clip_name, folder_name, use_subtitles)
            instruction = img_placeholder + '\n' + self.summary_instruction
            batch_images.append(images)
            batch_instructions.append(instruction)
            clips_names.append(clip_name)
            if len(batch_images) < batch_size:
                continue
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for i, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{clips_names[i]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[i]}"
                else:
                    if args.index_subtitles_together and use_subtitles:
                        if conversations[i] != "":
                            max_subtitle_index += 1
                            important_data.update({f"subtitle_{max_subtitle_index}__{clips_names[i]}": conversations[i]})
                    preds[f'caption_{max_caption_index}__{clips_names[i]}'] = pred

            batch_images = []
            batch_instructions = []
            clips_names = []
            conversations = []
        # run inference for the last batch
        if len(batch_images) > 0:
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for i, pred in enumerate(batch_pred):
                max_caption_index += 1
                videos_summaries.append(pred)
                if args.use_coherent_description:
                    preds[f'caption_{max_caption_index}__{clips_names[i]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[i]}"
                else:
                    if args.index_subtitles_together and use_subtitles:
                        if conversations[i] != "":
                            max_subtitle_index += 1
                            important_data.update({f"subtitle_{max_subtitle_index}__{clips_names[i]}": conversations[i]})
                    preds[f'caption_{max_caption_index}__{clips_names[i]}'] = pred
            batch_images = []
            batch_instructions = []
            clips_names = []
        return preds, important_data

    def episode_inference_only_subtitles(self, clips, tv_images_path, subtitle_path):
        max_subtitle_index = 0
        important_data = {}
        for c_name in tqdm(clips, desc="Inference Episode clips"):
            clip_subtitles_path = os.path.join(subtitle_path, c_name + ".srt")
            clip_subtitles = self._get_shows_subtitles(clip_subtitles_path)
            conversation = ""
            if args.index_subtitles_together:
                if self.subtitles.get(c_name, False):
                    for subtitle in self.subtitles[c_name]:
                        conversation += subtitle['text'] + " "
                    conversation = clean_text(conversation)
                    if conversation != "":
                        max_subtitle_index += 1
                        important_data.update({f"subtitle_{max_subtitle_index}__{c_name}": conversation})
        return important_data

    def prepare_input_images(self, clip_name, folder_name, use_subtitles):
        tv_shows_data, frames_path, subtitle_path = self._get_TVs_data()
        video_frames_path = os.path.join(frames_path, folder_name, clip_name)
        total_num_frames = len(os.listdir(video_frames_path))
        sampling_interval = round(total_num_frames / self.max_num_images)
        if sampling_interval == 0:
            sampling_interval = 1
        images = []
        img_placeholder = ""
        subtitle_text_in_interval = ""
        history_subtitles = {}
        number_of_sub_words = 0
        for i, frame in enumerate(sorted(os.listdir(video_frames_path))):
            # Find the subtitle that overlaps this frame; since only one frame per
            # sampling interval is kept, subtitles within the interval are merged into one caption.
            if self.subtitles.get(clip_name, False) and use_subtitles:
                for subtitle in self.subtitles[clip_name]:
                    if (subtitle['start'] <= (i / self.fps) <= subtitle['end']) and subtitle['text'] not in subtitle_text_in_interval:
                        if not history_subtitles.get(subtitle['text'], False):
                            subtitle_text_in_interval += subtitle['text'] + " "
                        history_subtitles[subtitle['text']] = True
                        break
            if i % sampling_interval == 0:
                frame = Image.open(os.path.join(video_frames_path, frame)).convert("RGB")
                frame = self.vis_processor(frame)
                images.append(frame)
                img_placeholder += '<Img><ImageHere>'
                if number_of_sub_words < self.max_sub_len and use_subtitles:
                    if subtitle_text_in_interval != "":
                        subtitle_text_in_interval = clean_text(subtitle_text_in_interval)
                        img_placeholder += f'<Cap>{subtitle_text_in_interval}'
                        number_of_sub_words += len(subtitle_text_in_interval.split(' '))
                        subtitle_text_in_interval = ""
            if len(images) >= self.max_num_images:
                break
        if len(images) == 0:
            print("Video not found", video_frames_path)

        if 0 < len(images) < self.max_num_images:
            last_item = images[-1]
            while len(images) < self.max_num_images:
                images.append(last_item)
                img_placeholder += '<Img><ImageHere>'
        images = torch.stack(images)
        return images, img_placeholder
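    # Illustrative sketch (not part of the original script): prepare_input_images
    # fits any clip length into the fixed 45-image budget by keeping every
    # round(total/45)-th frame, then padding short clips by repeating the last frame.
    @staticmethod
    def _demo_sample_indices(total_num_frames, max_images=45):
        interval = round(total_num_frames / max_images) or 1
        idxs = [i for i in range(total_num_frames) if i % interval == 0][:max_images]
        while 0 < len(idxs) < max_images:  # pad short clips with the last frame
            idxs.append(idxs[-1])
        return idxs  # e.g. len(_demo_sample_indices(500)) == len(_demo_sample_indices(30)) == 45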
    def clip_inference(self, clips_name, folders_name, prompts):
        setup_seeds(seed)
        images_batch, instructions_batch = [], []
        for clip_name, folder_name, prompt in zip(clips_name, folders_name, prompts):
            images, img_placeholder = self.prepare_input_images(clip_name, folder_name, use_subtitles=not args.vision_only)
            instruction = img_placeholder + '\n' + prompt
            images_batch.append(images)
            instructions_batch.append(instruction)
        # run inference for the batch
        images_batch = torch.stack(images_batch)
        batch_pred = self.run_images(images_batch, instructions_batch)
        return batch_pred

    def prepare_prompt(self, qa):
        prompt = qa["q"] + " \n\nAs you watched in this video, choose ONE suitable answer from these multiple choices \n"
        for i, choice in enumerate(["a0", "a1", "a2", "a3", "a4"]):
            prompt += f"option {i}: {qa[choice]} \n"
        if args.add_unknown and args.add_confidance_score:
            # Add the unknown option
            prompt += "option 5: Can't answer based on the provided information\n"
            prompt += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE and also output a CONFIDENCE SCORE FROM 0 TO 5 representing how confident you are with your answer, where 0 is the least confident and 5 is the most confident"
        elif args.add_unknown:
            prompt += "option 5: Can't answer based on the provided information\n"
            prompt += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE"
        elif args.add_confidance_score:
            prompt += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE and also output a CONFIDENCE SCORE FROM 0 TO 5 representing how confident you are with your answer, where 0 is the least confident and 5 is the most confident"
        else:
            prompt += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE"
        return prompt

    def get_most_related_clips(self, qa, related_context_keys):
        if args.use_gt_information:
            most_related_clips = [qa['vid_name']]
        elif args.use_gt_information_with_distraction:
            # start from the ground-truth clip, then add retrieved clips as distractions
            most_related_clips = [qa['vid_name']]
            for context_key in related_context_keys:
                if len(context_key.split('__')) > 1:
                    most_related_clips.append(context_key.split('__')[1])
                if len(most_related_clips) == args.num_distraction + 1:
                    break
        else:
            most_related_clips = []
            for context_key in related_context_keys:
                if len(context_key.split('__')) > 1:
                    most_related_clips.append(context_key.split('__')[1])
                if len(most_related_clips) == args.neighbours:
                    break
        assert len(most_related_clips) != 0, f"No related clips found {related_context_keys}"
        return most_related_clips
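    # Illustrative note (not part of the original script): with
    # --use_gt_information_with_distraction and --num_distraction 2, selection starts
    # from the ground-truth clip and appends retrieved clips until 3 are collected
    # (clip names below are made up).
    @staticmethod
    def _demo_distraction_selection():
        gt = "friends_s01e05_seg02_clip_01"
        retrieved = ["caption_4__friends_s01e05_seg02_clip_07",
                     "caption_9__friends_s01e05_seg02_clip_03",
                     "caption_1__friends_s01e05_seg02_clip_09"]
        clips, num_distraction = [gt], 2
        for key in retrieved:
            if len(key.split('__')) > 1:
                clips.append(key.split('__')[1])
            if len(clips) == num_distraction + 1:
                break
        assert clips == [gt, "friends_s01e05_seg02_clip_07", "friends_s01e05_seg02_clip_03"]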
    def use_clips_for_info(self, qa_list, related_context_keys_list, external_memory):
        total_batch_pred = []
        questions = []
        related_information_list = []
        related_context_keys_list_new = []
        for qa, related_context_keys in zip(qa_list, related_context_keys_list):
            most_related_clips = self.get_most_related_clips(qa, related_context_keys)
            folder_name = self.tv_shows_mapping[qa['show_name']]
            question = qa['q'] + "\nand these are the choices :\n"
            for i, choice in enumerate(["a0", "a1", "a2", "a3", "a4"]):
                question += f"option {i}: {qa[choice]} \n"
            if args.add_unknown:
                question += "option 5: Can't answer based on the provided information\n"
                question += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 5 INCLUSIVE"
            else:
                question += "\n Your output should be THE NUMBER OF THE CORRECT ANSWER FROM THE CHOICES FROM 0 TO 4 INCLUSIVE"
            if args.use_choices_for_info:
                prompt = f"From this video extract the information related to this multiple-choice question and provide an explanation for your answer. If you don't know the answer, say 'I DON'T KNOW' as option 5 because the question may not be related to the video content.\n the question is :\n {question}\n your answer :"
            else:
                prompt = f"As you watched in this video answer this {qa['q']}\n\n and also provide an EXPLANATION for your answer and if you don't know the answer, say that you don't know.\n\n"
            all_info = self.clip_inference(most_related_clips, [folder_name] * len(most_related_clips), [prompt] * len(most_related_clips))
            # concatenate all the information together
            related_information = ""
            for info, clip_name in zip(all_info, most_related_clips):
                clip_conversation = ""
                general_sum = ""
                for key in external_memory.documents.keys():
                    if clip_name in key and 'caption' in key:
                        general_sum = "Clip Summary: " + external_memory.documents[key]
                    if clip_name in key and 'subtitle' in key:
                        clip_conversation = "Clip Subtitles: " + external_memory.documents[key]

                if args.use_coherent_description:
                    related_information += f"question_related_information: {info},{general_sum}\n"
                else:
                    if args.model_summary_only:
                        related_information += f"{general_sum},question_related_information: {info}\n"
                    elif args.info_only:
                        related_information += f"question_related_information: {info}\n"
                    elif args.subtitles_only:
                        related_information += f"{clip_conversation},question_related_information: {info}\n"
                    elif args.subtitles_only_after_retrieval:
                        related_information += f"{clip_conversation},question_related_information: {info}\n"
                    else:
                        related_information += f"{general_sum},{clip_conversation},question_related_information: {info}\n"

            questions.append(question)
            related_information_list.append(related_information)
            related_context_keys.append(related_information)
            related_context_keys_list_new.append(related_context_keys)
            if len(questions) < args.batch_size:
                continue
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)

            for pred in batch_pred:
                total_batch_pred.append(pred)
            questions = []
            related_information_list = []

        if len(questions) > 0:
            setup_seeds(seed)
            if args.use_chatgpt:
                batch_pred = self.inference_RAG_chatGPT(questions, related_information_list)
            else:
                batch_pred = self.inference_RAG(questions, related_information_list)
            for pred in batch_pred:
                total_batch_pred.append(pred)
        return total_batch_pred, related_context_keys_list_new

    def answer_TV_questions_RAG(self, qa_list, external_memory, episode_clips, episode_name):
        related_context_keys_list, related_context_documents_list = [], []
        setup_seeds(seed)
        for qa in qa_list:
            question_choices = qa['q'] + "\n and these are the options for the question\n\n"
            for i, choice in enumerate(["a0", "a1", "a2", "a3", "a4"]):
                question_choices += f"option {i}: {qa[choice]} \n\n"
            related_context_documents, related_context_keys = external_memory.search_by_similarity(question_choices)
|
| 388 |
+
|
| 389 |
+
related_context_documents_list.append(related_context_documents)
|
| 390 |
+
related_context_keys_list.append(related_context_keys)
|
| 391 |
+
|
| 392 |
+
if args.use_clips_for_info:
|
| 393 |
+
batch_pred,related_context_keys_list=self.use_clips_for_info(qa_list,related_context_keys_list,external_memory)
|
| 394 |
+
else:
|
| 395 |
+
prompts=[]
|
| 396 |
+
related_context_documents_text_list=[]
|
| 397 |
+
for qa,related_context_documents,related_context_keys in zip(qa_list,related_context_documents_list,related_context_keys_list):
|
| 398 |
+
|
| 399 |
+
related_information=""
|
| 400 |
+
most_related_clips=self.get_most_related_clips(qa,related_context_keys)
|
| 401 |
+
for clip_name in most_related_clips:
|
| 402 |
+
clip_conversation=""
|
| 403 |
+
general_sum=""
|
| 404 |
+
for key in external_memory.documents.keys():
|
| 405 |
+
if clip_name in key and 'caption' in key:
|
| 406 |
+
general_sum="Clip Summary: "+external_memory.documents[key]
|
| 407 |
+
if clip_name in key and 'subtitle' in key:
|
| 408 |
+
clip_conversation="Clip Subtitles: "+external_memory.documents[key]
|
| 409 |
+
# related_information+=f"{general_sum},{clip_conversation}\n"
|
| 410 |
+
if args.use_coherent_description:
|
| 411 |
+
related_information+=f"{general_sum}\n"
|
| 412 |
+
else:
|
| 413 |
+
if args.model_summary_only:
|
| 414 |
+
related_information+=f"{general_sum}\n"
|
| 415 |
+
elif args.subtitles_only:
|
| 416 |
+
related_information+=f"{clip_conversation}\n"
|
| 417 |
+
else:
|
| 418 |
+
related_information+=f"{general_sum},{clip_conversation}\n"
|
| 419 |
+
|
| 420 |
+
prompt=self.prepare_prompt(qa)
|
| 421 |
+
prompts.append(prompt)
|
| 422 |
+
related_context_documents_text_list.append(related_information)
|
| 423 |
+
|
| 424 |
+
setup_seeds(seed)
|
| 425 |
+
if args.use_chatgpt:
|
| 426 |
+
batch_pred=self.inference_RAG_chatGPT(prompts, related_context_documents_text_list)
|
| 427 |
+
else:
|
| 428 |
+
batch_pred=self.inference_RAG(prompts, related_context_documents_text_list)
|
| 429 |
+
return batch_pred ,related_context_keys_list
|
| 430 |
+
def answer_episode_questions(self,questions,information_RAG_path,embedding_path,episode_clips):
|
| 431 |
+
external_memory=MemoryIndex(args.neighbours, use_openai=args.use_openai_embedding)
|
| 432 |
+
if os.path.exists(embedding_path):
|
| 433 |
+
external_memory.load_embeddings_from_pkl(embedding_path)
|
| 434 |
+
else:
|
| 435 |
+
external_memory.load_documents_from_json(information_RAG_path,embedding_path)
|
| 436 |
+
episode_name=information_RAG_path.split('/')[-1].split('.')[0]
|
| 437 |
+
pred_json=[]
|
| 438 |
+
batch_questions=[]
|
| 439 |
+
for qa in tqdm(questions,desc="Answering questions"):
|
| 440 |
+
batch_questions.append(qa)
|
| 441 |
+
if len(batch_questions)<args.batch_size:
|
| 442 |
+
continue
|
| 443 |
+
batch_pred,batch_related_context_keys = self.answer_TV_questions_RAG(batch_questions,external_memory,episode_clips,episode_name)
|
| 444 |
+
for pred,related_context_keys,qa in zip(batch_pred,batch_related_context_keys,batch_questions):
|
| 445 |
+
qa['pred']=pred
|
| 446 |
+
qa['related_context_keys']=related_context_keys
|
| 447 |
+
pred_json.append(qa)
|
| 448 |
+
batch_questions=[]
|
| 449 |
+
if len(batch_questions)>0:
|
| 450 |
+
batch_pred,batch_related_context_keys = self.answer_TV_questions_RAG(batch_questions,external_memory,episode_clips,episode_name)
|
| 451 |
+
for pred,related_context_keys,qa in zip(batch_pred,batch_related_context_keys,batch_questions):
|
| 452 |
+
qa['pred']=pred
|
| 453 |
+
qa['related_context_keys']=related_context_keys
|
| 454 |
+
pred_json.append(qa)
|
| 455 |
+
return pred_json
|
| 456 |
+
|
| 457 |
+
def eval_tv_shows(self,):
|
| 458 |
+
tv_shows_data,frames_path,subtitle_path=self._get_TVs_data()
|
| 459 |
+
full_questions_result=[]
|
| 460 |
+
number_of_episodes=0
|
| 461 |
+
start=args.start
|
| 462 |
+
end=args.end
|
| 463 |
+
for show in tqdm(tv_shows_data,desc="Inference TV shows"):
|
| 464 |
+
for season in tqdm(tv_shows_data[show],desc=f"Inference {show} seasons"):
|
| 465 |
+
for episode in tqdm(tv_shows_data[show][season],desc=f"Inference {show} {season} episodes"):
|
| 466 |
+
# Generate clips summary and store the important data (summary and subtitles) in json file
|
| 467 |
+
if start<=number_of_episodes<end:
|
| 468 |
+
folder_name=self.tv_shows_mapping[show]
|
| 469 |
+
tv_images_path =os.path.join(frames_path,folder_name)
|
| 470 |
+
os.makedirs(self.save_long_videos_path, exist_ok=True)
|
| 471 |
+
save_name="" if args.index_subtitles_together else "no_subtitles_"
|
| 472 |
+
save_name="subtitles_only" if args.subtitles_only else save_name
|
| 473 |
+
save_name="use_coherent_description" if args.use_coherent_description else save_name
|
| 474 |
+
file_path=os.path.join(self.save_long_videos_path,save_name+folder_name+"_"+season+"_"+episode+".json")
|
| 475 |
+
embedding_path=os.path.join(self.save_embedding_path,save_name+folder_name+"_"+season+"_"+episode+".pkl")
|
| 476 |
+
# options don't require rerunning the inference
|
| 477 |
+
save_name+="_unknown" if args.add_unknown else ""
|
| 478 |
+
save_name+="_clips_for_info" if args.use_clips_for_info else ""
|
| 479 |
+
save_name+="_chatgpt" if args.use_chatgpt else ""
|
| 480 |
+
save_name+="_choices_for_info" if args.use_choices_for_info else ""
|
| 481 |
+
save_name+="_info_only" if args.info_only else ""
|
| 482 |
+
save_name+="_subtitles_only" if args.subtitles_only else ""
|
| 483 |
+
save_name+="_subtitles_only_after_retrieval" if args.subtitles_only_after_retrieval else ""
|
| 484 |
+
if os.path.exists(file_path):
|
| 485 |
+
with open(file_path, 'r') as file:
|
| 486 |
+
important_data = json.load(file)
|
| 487 |
+
print("Already processed")
|
| 488 |
+
else:
|
| 489 |
+
episode_clips=tv_shows_data[show][season][episode]['clips']
|
| 490 |
+
if args.subtitles_only :
|
| 491 |
+
important_data=self.episode_inference_only_subtitles(episode_clips,tv_images_path,subtitle_path)
|
| 492 |
+
else:
|
| 493 |
+
preds,important_data=self.episode_inference(episode_clips,folder_name,use_subtitles=not args.vision_only)
|
| 494 |
+
important_data.update(preds)
|
| 495 |
+
# if not args.subtitles_only :
|
| 496 |
+
# summary = self.compine_summaries(important_data)
|
| 497 |
+
# preds['summary'] = summary
|
| 498 |
+
# important_data["summary"]=summary
|
| 499 |
+
with open(file_path, 'w') as file:
|
| 500 |
+
json.dump(important_data, file, indent=4)
|
| 501 |
+
# Answer questions
|
| 502 |
+
questions=tv_shows_data[show][season][episode]['questions']
|
| 503 |
+
episode_clips=tv_shows_data[show][season][episode]['clips']
|
| 504 |
+
episode_name=file_path.split('/')[-1].split('.')[0]
|
| 505 |
+
pred_json=self.answer_episode_questions(questions,file_path,embedding_path,episode_clips)
|
| 506 |
+
full_questions_result.extend(pred_json)
|
| 507 |
+
save_dir=f"new_workspace/results/tvqa/{args.exp_name}/{save_name}_{args.neighbours}_neighbours"
|
| 508 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 509 |
+
with open(f"{save_dir}/{episode_name}.json", 'w') as fp:
|
| 510 |
+
json.dump(pred_json, fp)
|
| 511 |
+
print(f"Episode {episode_name} prediction saved to {save_dir}/{episode_name}_pred_{args.neighbours}.json")
|
| 512 |
+
number_of_episodes+=1
|
| 513 |
+
with open(f"{save_dir}/full_pred_{start}_{end}.json", 'w') as fp:
|
| 514 |
+
json.dump(full_questions_result, fp)
|
| 515 |
+
print(f"TV shows prediction saved to {save_dir}/full_pred_{start}{end}.json")
|
| 516 |
+
args=get_arguments()
|
| 517 |
+
|
| 518 |
+
def setup_seeds(seed):
|
| 519 |
+
random.seed(seed)
|
| 520 |
+
np.random.seed(seed)
|
| 521 |
+
torch.manual_seed(seed)
|
| 522 |
+
torch.cuda.manual_seed(seed)
|
| 523 |
+
cudnn.benchmark = False
|
| 524 |
+
cudnn.deterministic = True
|
| 525 |
+
|
| 526 |
+
import yaml
|
| 527 |
+
with open('test_configs/llama2_test_config.yaml') as file:
|
| 528 |
+
config = yaml.load(file, Loader=yaml.FullLoader)
|
| 529 |
+
seed=config['run']['seed']
|
| 530 |
+
print("seed",seed)
|
| 531 |
+
|
| 532 |
+
if __name__ == "__main__":
|
| 533 |
+
setup_seeds(seed)
|
| 534 |
+
tvqa_eval=TVQAEVAL(args)
|
| 535 |
+
tvqa_eval.eval_tv_shows()
|
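A note on the retrieval step above: it leans on a naming convention rather than an explicit index structure. Every entry in the external memory is keyed as `caption__<clip_name>` or `subtitle__<clip_name>`, so `get_most_related_clips` recovers clip names by splitting on the double underscore. A minimal standalone sketch of that convention (the function name and the example key below are illustrative, not repo code):

# Sketch of the key convention assumed by get_most_related_clips:
# keys look like "caption__<clip_name>" or "subtitle__<clip_name>",
# so splitting on "__" recovers the clip name.
def clip_names_from_keys(related_context_keys, max_clips):
    clips = []
    for key in related_context_keys:
        parts = key.split('__')
        if len(parts) > 1:
            clips.append(parts[1])
        if len(clips) == max_clips:
            break
    return clips

# e.g. clip_names_from_keys(["caption__friends_s01e01_clip_03"], 3)
# -> ["friends_s01e01_clip_03"]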
evaluation/eval_minigpt4_video.py
ADDED
@@ -0,0 +1,201 @@
import os
import json
from tqdm import tqdm
import sys
project_dir = os.getcwd()
sys.path.append(project_dir)
from torch.utils.data import DataLoader
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser
from minigpt4.conversation.conversation import CONV_VISION
from minigpt4.processors.blip_processors import Blip2ImageTrainProcessor, BlipCaptionProcessor
from minigpt4.datasets.datasets.video_datasets import VideoChatGPTEvalDataset, VideoChatGPTEval_consistancy, Video_validation_Dataset, TVQAEVAL

parser = eval_parser()
parser.add_argument("--dataset", type=str, default='msvd', help="dataset to evaluate")
parser.add_argument("--add_subtitles", action='store_true', help="whether to add subtitles to the video")
parser.add_argument("--name", type=str, default='test', help="evaluation name")
parser.add_argument("--videos_path", type=str, default='videos path', help="path to videos")
parser.add_argument("--subtitles_path", type=str, default='subtitles path', help="path to subtitles")
parser.add_argument("--ann_path", type=str, default='annotations path', help="path to annotations")

parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--start", type=int, default=0, help="start from video number")
parser.add_argument("--end", type=int, default=10000000, help="end at video number")
args = parser.parse_args()

print(args.ckpt)
print(args.name)
print(args.cfg_path)
if "test_configs/mistral_test_config.yaml" == args.cfg_path:
    llm_name = "mistral"
else:
    llm_name = "llama2"
print("using captions", args.add_subtitles)
model, vis_processor, whisper_gpu_id, minigpt4_gpu_id, answer_module_gpu_id = init_model(args)
conv_temp = CONV_VISION.copy()
conv_temp.system = ""
if args.dataset == 'video_chatgpt_generic':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    annotations_keys = ['Q', 'A', 'video_name']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)
elif args.dataset == 'video_chatgpt_temporal':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    annotations_keys = ['Q', 'A', 'video_name']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)
elif args.dataset == 'video_chatgpt_consistency':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    annotations_keys = [['Q1', 'Q2'], 'A', 'video_name']
    data = VideoChatGPTEval_consistancy(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)

elif args.dataset == 'msrvtt':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    annotations_keys = ['question', 'answer', 'video_id']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)

elif args.dataset == 'msvd':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = ""  # no subtitles for MSVD because these videos have no audio
    annotations_keys = ['question', 'answer', 'video_id']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)
elif args.dataset == 'activitynet':
    ann_path = args.ann_path
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    annotations_keys = ['question', 'answer', 'video_id']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=args.add_subtitles, llm_name=llm_name)
elif args.dataset == 'tgif':
    ann_path = "datasets/evaluation_datasets/tgif/Test_frameqa_question.json"
    videos_path = args.videos_path
    subtitles_path = ""  # no subtitles for TGIF because these videos have no audio
    annotations_keys = ['question', 'answer', 'gif_name']
    data = VideoChatGPTEvalDataset(vis_processor, videos_path, ann_path, subtitles_path, annotations_keys, add_subtitles=False, llm_name=llm_name)
elif args.dataset == 'tvqa':
    # TVQA dataset
    ann_path = "datasets/evaluation_datasets/tvqa_short/tvqa_val.json"
    videos_path = args.videos_path
    subtitles_path = args.subtitles_path
    data = TVQAEVAL(vis_processor, videos_path, ann_path, subtitles_path, add_subtitles=args.add_subtitles, llm_name=llm_name)

eval_dataloader = DataLoader(data, batch_size=args.batch_size, shuffle=False)

minigpt4_predict = []
sub = "subtitles" if args.add_subtitles else "no_subtitles"
if args.start == 0 and args.end == 10000000:
    save_path = f'results/{args.name}_{args.dataset}_{sub}.json'
else:
    print("start from video number", args.start)
    print("end at video number", args.end)
    save_path = f'results/{args.name}_{args.dataset}_{sub}_{args.start}_{args.end}.json'

os.makedirs("results", exist_ok=True)
c = 0
pred_result = {}
gt_result = {}
if args.dataset == 'video_chatgpt_consistency':
    for images, texts_1, texts_2, gt_answers, lengths, videos_ids in tqdm(eval_dataloader, desc=f"Eval {args.dataset}"):
        if args.start <= c < args.end:
            texts_q1 = prepare_texts(texts_1, conv_temp, template='', lengths=lengths)  # wrap the texts with the conversation template
            texts_q2 = prepare_texts(texts_2, conv_temp, template='', lengths=lengths)  # wrap the texts with the conversation template
            models_answers_q1 = model.generate(images, texts_q1, max_new_tokens=args.max_new_tokens, do_sample=False, lengths=lengths, num_beams=1)
            models_answers_q2 = model.generate(images, texts_q2, max_new_tokens=args.max_new_tokens, do_sample=False, lengths=lengths, num_beams=1)
            for video_id, model_answer_q1, model_answer_q2, gt_answer, text_q1, text_q2 in zip(videos_ids, models_answers_q1, models_answers_q2, gt_answers, texts_q1, texts_q2):
                result = dict()
                result['video_name'] = video_id
                result['Q1'] = text_q1.split('\n')[-1].replace('[/INST]', '')
                result['Q2'] = text_q2.split('\n')[-1].replace('[/INST]', '')
                result['A'] = gt_answer
                result['pred1'] = model_answer_q1
                result['pred2'] = model_answer_q2
                pred_result[video_id] = [model_answer_q1, model_answer_q2]
                gt_result[video_id] = [gt_answer]
                minigpt4_predict.append(result)
            # save results every 100 videos to avoid losing progress
            if c % 100 == 0:
                with open(save_path, 'w') as f:
                    json.dump(minigpt4_predict, f)
        if c >= args.end:
            break
        c += 1

elif args.dataset == 'tvr':
    for images, texts, gt_answers, lengths, videos_ids in tqdm(eval_dataloader, desc=f"Eval {args.dataset}"):
        if args.start <= c < args.end:
            texts = prepare_texts(texts, conv_temp, template='', lengths=lengths)  # wrap the texts with the conversation template
            models_answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False, lengths=lengths, num_beams=1)
            for video_id, model_answer, gt_answer, text in zip(videos_ids, models_answers, gt_answers, texts):
                result = dict()
                result['video_name'] = video_id
                result['Q'] = text.split('\n')[-1].replace('[/INST]', '')
                result['A'] = gt_answer
                result['pred'] = model_answer
                pred_result[video_id] = [model_answer]
                gt_result[video_id] = [gt_answer]
                minigpt4_predict.append(result)
            # save results every 100 videos to avoid losing progress
            if c % 100 == 0:
                with open(save_path, 'w') as f:
                    json.dump(minigpt4_predict, f)
        if c >= args.end:
            break
        c += 1
elif args.dataset == 'ego_schema' or args.dataset == 'tvqa' or args.dataset == 'tvqa_long_videos':
    for images, texts, gt_answers, lengths, videos_ids in tqdm(eval_dataloader, desc=f"Eval {args.dataset}"):
        if args.start <= c < args.end:
            texts = prepare_texts(texts, conv_temp, template='', lengths=lengths)  # wrap the texts with the conversation template
            models_answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False, lengths=lengths, num_beams=1)
            for video_id, model_answer, gt_answer, text in zip(videos_ids, models_answers, gt_answers, texts):
                result = dict()
                result['video_name'] = video_id
                if args.dataset == 'tvqa_long_videos':
                    result['Q'] = text.split('\n\n')[1:]
                else:
                    result['Q'] = text.split('\n')[1:]
                result['A'] = gt_answer
                result['pred'] = model_answer
                pred_result[video_id] = [model_answer]
                gt_result[video_id] = [gt_answer]
                minigpt4_predict.append(result)
            # save results every 100 videos to avoid losing progress
            if c % 100 == 0:
                with open(save_path, 'w') as f:
                    json.dump(minigpt4_predict, f)
        if c >= args.end:
            break
        c += 1
else:
    for images, texts, gt_answers, lengths, videos_ids in tqdm(eval_dataloader, desc=f"Eval {args.dataset}"):
        if args.start <= c < args.end:
            texts = prepare_texts(texts, conv_temp, template='', lengths=lengths)  # wrap the texts with the conversation template
            models_answers = model.generate(images, texts, max_new_tokens=args.max_new_tokens, do_sample=False, lengths=lengths, num_beams=1)
            for video_id, model_answer, gt_answer, text in zip(videos_ids, models_answers, gt_answers, texts):
                result = dict()
                result['video_name'] = video_id
                result['Q'] = text.split('\n')[-1].replace('[/INST]', '')
                result['A'] = gt_answer
                result['pred'] = model_answer
                pred_result[video_id] = [model_answer]
                gt_result[video_id] = [gt_answer]
                minigpt4_predict.append(result)
            # save results every 100 videos to avoid losing progress
            if c % 100 == 0:
                with open(save_path, 'w') as f:
                    json.dump(minigpt4_predict, f)
        if c >= args.end:
            break
        c += 1

with open(save_path, 'w') as f:
    json.dump(minigpt4_predict, f)
print("saved results to", save_path)
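Each dataset branch in `eval_minigpt4_video.py` differs only in the `annotations_keys` triplet handed to the dataset class (question key, answer key, video-id key). A hedged sketch of that pattern, assuming a flat JSON list of annotation dicts; `load_samples` is a hypothetical helper, not repo code:

import json

def load_samples(ann_path, annotations_keys):
    # annotations_keys = (question key, answer key, video-id key)
    q_key, a_key, v_key = annotations_keys
    with open(ann_path) as f:
        annotations = json.load(f)
    return [(ann[q_key], ann[a_key], ann[v_key]) for ann in annotations]

# e.g. load_samples("annotations.json", ['question', 'answer', 'video_id'])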
evaluation/eval_retrieval_acc_tvqa.py
ADDED
@@ -0,0 +1,316 @@
import sys
import os
project_dir = os.getcwd()
sys.path.append(project_dir)
import json
from tqdm import tqdm
from goldfish_lv import GoldFish_LV, split_subtitles, time_to_seconds
import argparse
import torch
import re
from PIL import Image
# from openai import OpenAI
from index import MemoryIndex
import random
import numpy as np
import torch.backends.cudnn as cudnn

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--neighbours", type=int, default=-1)
    parser.add_argument("--name", type=str, default="ckpt_92", help="name of the experiment")
    parser.add_argument("--exp_name", type=str, default="", help="name of the experiment")
    parser.add_argument("--add_unknown", action='store_true')
    parser.add_argument("--use_chatgpt", action='store_true')
    parser.add_argument("--use_choices_for_info", action='store_true')
    parser.add_argument("--use_gt_information", action='store_true')
    parser.add_argument("--inference_text", action='store_true')
    parser.add_argument("--use_gt_information_with_distraction", action='store_true')
    parser.add_argument("--num_distraction", type=int, default=2)
    parser.add_argument("--add_confidance_score", action='store_true')
    parser.add_argument("--use_original_video", action='store_true')
    parser.add_argument("--use_video_embedding", action='store_true')
    parser.add_argument("--use_clips_for_info", action='store_true')
    parser.add_argument("--use_GT_video", action='store_true')
    parser.add_argument("--use_gt_summary", action='store_true')

    parser.add_argument("--ask_the_question_early", action='store_true')
    parser.add_argument("--clip_in_ask_early", action='store_true')
    parser.add_argument("--use_coherent_description", action='store_true')

    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=100000, type=int)

    parser.add_argument("--vision_only", action='store_true')
    parser.add_argument("--model_summary_only", action='store_true')
    parser.add_argument("--subtitles_only", action='store_true')
    parser.add_argument("--subtitles_only_after_retrieval", action='store_true')
    parser.add_argument("--info_only", action='store_true')

    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="path to the video")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

def clean_text(subtitles_text):
    # Remove unwanted characters except letters, digits, and single quotes
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple spaces with a single space
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()

class TVQAEVALRetrieval(GoldFish_LV):
    def __init__(self, args: argparse.Namespace) -> None:
        super().__init__(args)
        self.tv_shows_mapping = {"Grey's Anatomy": "grey_frames", 'How I Met You Mother': "met_frames", 'Friends': "friends_frames", 'The Big Bang Theory': "bbt_frames", 'House M.D.': "house_frames", 'Castle': "castle_frames"}
        self.save_long_videos_path = f"workspace/results/tv_shows/{args.name}"
        os.makedirs(self.save_long_videos_path, exist_ok=True)
        self.max_sub_len = 400
        self.max_num_images = 45
        self.fps = 3
        with open("datasets/evaluation_datasets/goldfish_eval_datasets/tvqa/tvqa_preprocessed_subtitles.json") as f:
            self.subtitles_list = json.load(f)
        self.subtitles = {}
        for sub in self.subtitles_list:
            self.subtitles[sub["vid_name"]] = sub["sub"]

    def _get_TVs_data(self):
        json_file_path = "datasets/evaluation_datasets/long_video_datasets/tvqa/tvqa_val_edited.json"
        frames_path = "/ibex/project/c2090/datasets/TVR_dataset/videos/video_files/frames_hq/"
        subtitle_path = "/ibex/project/c2090/datasets/TVR_dataset/videos/tvqa_subtitles"
        with open(json_file_path) as f:
            tv_shows_data = json.load(f)
        return tv_shows_data, frames_path, subtitle_path

        return vision_questions, subtitle_questions, frames_path

    def episode_inference(self, video_frames_path, qa, use_subtitles):
        batch_prepared_images, batch_img_placeholder, gt_clip_numbers = self.prepare_input_images(video_frames_path, qa, use_subtitles, n_clips=10)
        preds = {}
        batch_instructions = []
        batch_images = []
        important_data = {}
        conversations = []
        clips_numbers = []
        for clip_number, images, img_placeholder in zip(range(len(batch_prepared_images)), batch_prepared_images, batch_img_placeholder):
            instruction = img_placeholder + '\n' + self.summary_instruction
            batch_images.append(images)
            batch_instructions.append(instruction)
            conv = img_placeholder.replace('<Img><ImageHere>', '')
            conv = conv.replace('<Cap>', ' ')
            conversations.append(conv.strip())
            clips_numbers.append(clip_number)
            if len(batch_images) < args.batch_size:
                continue
            batch_images = torch.stack(batch_images)
            setup_seeds(seed)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for i, pred in enumerate(batch_pred):
                if args.use_coherent_description:
                    preds[f'caption__{clips_numbers[i]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[i]}"
                else:
                    if use_subtitles:
                        if conversations[i] != "":
                            important_data.update({f"subtitle__{clips_numbers[i]}": conversations[i]})
                    preds[f'caption__{clips_numbers[i]}'] = pred

            batch_images = []
            batch_instructions = []
            conversations = []
            clips_numbers = []
        # run inference for the last batch
        if len(batch_images) > 0:
            batch_images = torch.stack(batch_images)
            batch_pred = self.run_images(batch_images, batch_instructions)
            for i, pred in enumerate(batch_pred):
                if args.use_coherent_description:
                    preds[f'caption__{clips_numbers[i]}'] = f"model_summary :{pred}\nVideo conversation :{conversations[i]}"
                else:
                    if use_subtitles:
                        if conversations[i] != "":
                            important_data.update({f"subtitle__{clips_numbers[i]}": conversations[i]})
                    preds[f'caption__{clips_numbers[i]}'] = pred
            batch_images = []
            batch_instructions = []
            clips_numbers = []
        return preds, important_data, gt_clip_numbers

    def episode_inference_only_subtitles(self, video_frames_path, qa):
        use_subtitles = True
        batch_prepared_images, batch_img_placeholder, gt_clip_numbers = self.prepare_input_images(video_frames_path, qa, use_subtitles, n_clips=10)
        important_data = {}
        for clip_number, img_placeholder in enumerate(batch_img_placeholder):
            conv = img_placeholder.replace('<Img><ImageHere>', '')
            conv = conv.replace('<Cap>', ' ')
            conversation = conv.strip()
            conversation = clean_text(conversation)
            if conversation != "":
                important_data.update({f"subtitle__{clip_number}": conversation})
        return important_data, gt_clip_numbers

    def prepare_input_images(self, video_frames_path, qa, use_subtitles, n_clips=10):
        batch_images = []
        batch_img_placeholder = []
        clip_name = video_frames_path.split('/')[-1]
        images = []
        img_placeholders = []
        gt_clip_numbers = set()
        gt_start_time = qa['ts'][0]
        gt_end_time = qa['ts'][1]
        total_num_frames = len(os.listdir(video_frames_path))
        subtitle_text_in_interval = ""
        history_subtitles = {}
        number_of_sub_words = 0
        # samples_per_clip = total_num_frames // n_clips
        samples_per_clip = 45
        clip_num = 0
        for i, frame in enumerate(sorted(os.listdir(video_frames_path))):
            # Find the subtitle matching this frame and merge the interval's subtitles into one subtitle;
            # we sample one frame every 2 seconds, so the subtitles in each 2-second interval are combined
            if self.subtitles.get(clip_name, False) and use_subtitles:
                for subtitle in self.subtitles[clip_name]:
                    if (subtitle['start'] <= (i / self.fps) <= subtitle['end']) and subtitle['text'] not in subtitle_text_in_interval:
                        if not history_subtitles.get(subtitle['text'], False):
                            subtitle_text_in_interval += subtitle['text'] + " "
                        history_subtitles[subtitle['text']] = True
                        break
            if gt_start_time <= (i / self.fps) <= gt_end_time:
                gt_clip_numbers.add(clip_num)
            if i % samples_per_clip == 0 and i != 0:
                # one full clip collected; sample up to max_num_images frames from the images array
                sample_value = len(images) // self.max_num_images
                if sample_value == 0:
                    sample_value = 1
                frames_indices = [i for i in range(0, len(images), sample_value)]
                samples_images = []
                img_placeholder = ''
                for j in frames_indices:
                    samples_images.append(images[j])
                    img_placeholder += img_placeholders[j]
                    if len(samples_images) >= self.max_num_images:
                        break
                if 0 < len(samples_images) < self.max_num_images:
                    last_item = samples_images[-1]
                    while len(samples_images) < self.max_num_images:
                        samples_images.append(last_item)
                        img_placeholder += '<Img><ImageHere>'
                samples_images = torch.stack(samples_images)
                batch_images.append(samples_images)
                batch_img_placeholder.append(img_placeholder)
                img_placeholders = []
                images = []
                clip_num += 1

            frame = Image.open(os.path.join(video_frames_path, frame)).convert("RGB")
            frame = self.vis_processor(frame)
            images.append(frame)
            img_placeholder = '<Img><ImageHere>'
            if number_of_sub_words < self.max_sub_len and use_subtitles:
                if subtitle_text_in_interval != "":
                    subtitle_text_in_interval = clean_text(subtitle_text_in_interval)
                    img_placeholder += f'<Cap>{subtitle_text_in_interval}'
                    number_of_sub_words += len(subtitle_text_in_interval.split(' '))
                    subtitle_text_in_interval = ""
            img_placeholders.append(img_placeholder)
        return batch_images, batch_img_placeholder, list(gt_clip_numbers)

    def test_retrieval(self, indexed_data_path, qa, gt_clip_numbers):
        external_memory = MemoryIndex(args.neighbours, use_openai=True)
        external_memory.load_documents_from_json(indexed_data_path)
        question = qa['desc']
        related_context_documents, related_context_keys = external_memory.search_by_similarity(question)
        print(f"related_context_keys {related_context_keys}")
        print(f"gt_clip_numbers {gt_clip_numbers}")
        for key in related_context_keys:
            clip_idx = int(key.split('__')[-1])
            if clip_idx in gt_clip_numbers:
                return True
        return False

    def get_ground_truth_clip(self, video_frames_path, qa):
        gt_clip_numbers = set()
        gt_start_time = qa['ts'][0]
        gt_end_time = qa['ts'][1]
        samples_per_clip = 45
        clip_num = 0
        for i in range(len(os.listdir(video_frames_path))):
            if gt_start_time <= (i / self.fps) <= gt_end_time:
                gt_clip_numbers.add(clip_num)
            if i % samples_per_clip == 0 and i != 0:
                clip_num += 1
        return list(gt_clip_numbers)

    def eval_tv_shows(self):
        vision_questions, subtitle_questions, frames_path = self._get_TVs_data()
        number_of_videos = 0
        start = args.start
        end = args.end
        if args.exp_name == "vision":
            questions = vision_questions
        else:
            questions = subtitle_questions
        correct_retrieval = 0
        wrong_retrieval = 0
        for qa in questions:
            # Generate the clip summaries and store the important data (summaries and subtitles) in a json file
            if start <= number_of_videos < end:
                show_name = qa['vid_name'].split('_')[0]
                if self.tv_shows_mapping.get(show_name, False):
                    folder_name = self.tv_shows_mapping[show_name]
                else:
                    folder_name = self.tv_shows_mapping['The Big Bang Theory']  # default to the bbt frames folder

                clip_frames_path = os.path.join(frames_path, folder_name, qa['vid_name'])
                save_name = "subtitles_only" if args.subtitles_only else "vision_only" if args.vision_only else "vision_subtitles"
                indexed_data_path = os.path.join(self.save_long_videos_path, f"{qa['vid_name']}_{args.exp_name}_{save_name}_num_{number_of_videos}.json")
                if not os.path.exists(indexed_data_path):
                    if args.subtitles_only:
                        # TODO
                        important_data, gt_clip_numbers = self.episode_inference_only_subtitles(clip_frames_path, qa)
                    else:
                        preds, important_data, gt_clip_numbers = self.episode_inference(clip_frames_path, qa, use_subtitles=not args.vision_only)
                        important_data.update(preds)
                    with open(indexed_data_path, 'w') as file:
                        json.dump(important_data, file, indent=4)
                else:
                    gt_clip_numbers = self.get_ground_truth_clip(clip_frames_path, qa)
                retrieval_res = self.test_retrieval(indexed_data_path, qa, gt_clip_numbers)
                if retrieval_res:
                    correct_retrieval += 1
                else:
                    wrong_retrieval += 1
            number_of_videos += 1

        save_dir = f"workspace/eval/retrieval/{args.exp_name}_neighbors_{args.neighbours}"
        save_dir += "_subtitles_only" if args.subtitles_only else "_vision_only" if args.vision_only else "_vision_subtitles"
        os.makedirs(save_dir, exist_ok=True)
        with open(f"{save_dir}/s{start}_end{end}.json", 'w') as fp:
            json.dump({"correct": correct_retrieval, "wrong": wrong_retrieval}, fp)

args = get_arguments()

def setup_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    cudnn.benchmark = False
    cudnn.deterministic = True

import yaml
with open('test_configs/llama2_test_config.yaml') as file:
    config = yaml.load(file, Loader=yaml.FullLoader)
seed = config['run']['seed']
print("seed", seed)

if __name__ == "__main__":
    setup_seeds(seed)
    tvqa_eval = TVQAEVALRetrieval(args)
    tvqa_eval.eval_tv_shows()
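The ground-truth clip computation in `get_ground_truth_clip` amounts to bucketing frame indices (at `fps=3`) into 45-frame clips and keeping every bucket that overlaps the annotated time span. A compact approximation as a standalone function (`gt_clips` is illustrative, not repo code, and its `i // frames_per_clip` bucketing can differ from the original loop by one frame exactly at clip boundaries):

def gt_clips(num_frames, ts, fps=3, frames_per_clip=45):
    # a clip counts as ground truth if any of its frames falls in [start, end]
    start, end = ts
    return sorted({i // frames_per_clip
                   for i in range(num_frames)
                   if start <= i / fps <= end})

# e.g. gt_clips(num_frames=270, ts=(20.0, 40.0)) -> [1, 2]
# (frames 60..120 cover seconds 20..40 at 3 fps, spanning clips 1 and 2)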
filter_json.py
ADDED
@@ -0,0 +1,63 @@
import json
import os

# === Path configuration ===
# Folder containing the videos
video_dir = 'datasets/stage3/videos'
# Path to the original JSON file
json_path = 'datasets/stage3/video_instruct_data.json'

def filter_data():
    print(f"Scanning video folder: {video_dir} ...")

    if not os.path.exists(video_dir):
        print(f"Error: video folder {video_dir} not found")
        return

    # 1. Collect the IDs of all existing videos (strip the file extension, e.g. .mp4)
    existing_video_ids = set()
    files = os.listdir(video_dir)
    for f in files:
        # Skip hidden files
        if f.startswith('.'):
            continue
        # Use the file name as the ID (e.g. v_xyz.mp4 -> v_xyz)
        vid_id = os.path.splitext(f)[0]
        existing_video_ids.add(vid_id)

    print(f"Found {len(existing_video_ids)} video files.")

    # 2. Read the original JSON
    print(f"Reading JSON: {json_path} ...")
    if not os.path.exists(json_path):
        print(f"Error: JSON file {json_path} not found")
        return

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    original_count = len(data)
    print(f"The original JSON contains {original_count} entries.")

    # 3. Filter
    filtered_data = []
    for item in data:
        # Read the video_id from the JSON entry
        vid = item.get('video_id')
        # Keep the entry only if its ID is in the set collected above
        if vid in existing_video_ids:
            filtered_data.append(item)

    filtered_count = len(filtered_data)
    print(f"{filtered_count} entries remain after filtering ({original_count - filtered_count} removed).")

    # 4. Save in place
    if filtered_count > 0:
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, indent=4)
        print("✅ JSON file updated! Training can start now.")
    else:
        print("⚠️ Warning: no data left after filtering! Check that the video folder path is correct and that the video file names match the IDs in the JSON.")

if __name__ == "__main__":
    filter_data()
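For reference, `filter_data` only inspects the `video_id` field, so an input entry needs nothing more than that key to survive the filter. An illustrative entry (the values and the extra key below are made up):

# Illustrative input shape for filter_json.py; only "video_id" is read.
example_entry = {"video_id": "v_abc123", "QA": "..."}
# The entry survives the filter only if a file such as
# datasets/stage3/videos/v_abc123.mp4 exists.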
goldfish_demo.py
ADDED
@@ -0,0 +1,198 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import spaces
import os
import argparse
import gradio as gr
from goldfish_lv import GoldFish_LV
from theme import minigptlv_style, custom_css, text_css
import re
from huggingface_hub import login, hf_hub_download
import time
import moviepy.editor as mp
from index import MemoryIndex


# hf_token = os.environ.get('HF_TKN')
# login(token=hf_token)
def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--name", type=str, default='test')
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--neighbours", type=int, default=3)
    parser.add_argument("--eval_opt", type=str, default='all')
    parser.add_argument("--max_new_tokens", type=int, default=512)
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for short video clips")
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, help="Path to the video file")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

def download_video(youtube_url, download_finish):
    if is_youtube_url(youtube_url):
        processed_video_path = goldfish_obj.process_video_url(youtube_url)
        download_finish = gr.State(value=True)
        return processed_video_path, download_finish
    else:
        return None, download_finish

def is_youtube_url(url: str) -> bool:
    youtube_regex = (
        r'(https?://)?(www\.)?'
        r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
        r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
    )
    return bool(re.match(youtube_regex, url))

@spaces.GPU(duration=60 * 5)
def gradio_long_inference_video(videos_list, tmp_save_path, subtitle_paths, use_subtitles=True):
    clips_summary = goldfish_obj.long_inference_video(videos_list, tmp_save_path, subtitle_paths)
    return clips_summary

@spaces.GPU(duration=60 * 3)
def gradio_short_inference_video(video_path, instruction, use_subtitles=True):
    pred = goldfish_obj.short_video_inference(video_path, instruction, use_subtitles)
    return pred

@spaces.GPU(duration=60 * 3)
def gradio_inference_RAG(instruction, related_information):
    pred = goldfish_obj.inference_RAG([instruction], [related_information])[0]
    return pred

def inference(video_path, use_subtitles=True, instruction="", number_of_neighbours=3):
    start_time = time.time()
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    goldfish_obj.args.neighbours = number_of_neighbours
    print(f"Video name: {video_name}")
    video_duration = mp.VideoFileClip(video_path).duration
    print(f"Video duration: {video_duration:.2f} seconds")
    # if the video is longer than 3 minutes we need to run the long-video inference
    if video_duration > 180:
        print("Long video")
        # if the video data is already stored in the external memory we can use it directly; otherwise run the long-video inference
        file_path = f'new_workspace/clips_summary/demo/{video_name}.json'
        if not os.path.exists(file_path):
            print("Clips summary is not ready")
            videos_list, tmp_save_path = goldfish_obj.split_long_video_into_clips(video_path)
            subtitle_paths = []
            for video_p in videos_list:
                clip_path = os.path.join(tmp_save_path, video_p)
                subtitle_path = goldfish_obj.get_subtitles(clip_path) if use_subtitles else None
                subtitle_paths.append(subtitle_path)
            gradio_long_inference_video(videos_list, tmp_save_path, subtitle_paths, use_subtitles=use_subtitles)
        else:
            print("External memory is ready")
        os.makedirs("new_workspace/embedding/demo", exist_ok=True)
        os.makedirs("new_workspace/open_ai_embedding/demo", exist_ok=True)
        if goldfish_obj.args.use_openai_embedding:
            embedding_path = f"new_workspace/open_ai_embedding/demo/{video_name}.pkl"
        else:
            embedding_path = f"new_workspace/embedding/demo/{video_name}.pkl"
        external_memory = MemoryIndex(goldfish_obj.args.neighbours, use_openai=goldfish_obj.args.use_openai_embedding)
        if os.path.exists(embedding_path):
            print("Loading embeddings from pkl file")
            external_memory.load_embeddings_from_pkl(embedding_path)
        else:
            # embed the information and save it in the pkl file
            external_memory.load_documents_from_json(file_path, embedding_path)
        # retrieve the context most similar to this instruction from the external memory

        related_context_documents, related_context_keys = external_memory.search_by_similarity(instruction)
        related_information = goldfish_obj.get_related_context(external_memory, related_context_keys)
        pred = gradio_inference_RAG(instruction, related_information)
        # remove stored data
        # os.remove(file_path)
        # os.system(f"rm -r workspace/tmp/{self.video_name}")
        # os.system(f"rm -r workspace/subtitles/{self.video_name}")
        # os.system(f"rm workspace/tmp/{self.video_id}.mp4")
    else:
        print("Short video")
        goldfish_obj.video_name = video_path.split('/')[-1].split('.')[0]
        pred = gradio_short_inference_video(video_path, instruction, use_subtitles)
    processing_time = time.time() - start_time
    print(f"Processing time: {processing_time:.2f} seconds")
    return pred


def process_video(path_url, has_subtitles, instruction, number_of_neighbours):
    if is_youtube_url(path_url):
        video_path = return_video_path(path_url)
    else:
        video_path = path_url
    pred = inference(video_path, has_subtitles, instruction, number_of_neighbours)
    return pred

def return_video_path(youtube_url):
    video_id = youtube_url.split("https://www.youtube.com/watch?v=")[-1].split('&')[0]
    if video_id:
        return os.path.join("workspace", "tmp", f"{video_id}.mp4")
    else:
        raise ValueError("Invalid YouTube URL provided.")

def run_gradio():
    title = """<h1 align="center">Goldfish Demo</h1>"""
    description = """<h5>[ECCV 2024 Accepted] Goldfish: Vision-Language Understanding of Arbitrarily Long Videos</h5>"""
    project_page = """<p><a href='https://vision-cair.github.io/MiniGPT4-video/'><img src='https://img.shields.io/badge/Project-Page-Green'></a></p>"""
    code_link = """<p><a href='https://github.com/Vision-CAIR/MiniGPT4-video'><img src='repo_imgs/goldfishai_png.png'></a></p>"""
    paper_link = """<p><a href=''><img src='https://img.shields.io/badge/Paper-PDF-red'></a></p>"""
    with gr.Blocks(title="Goldfish demo", css=text_css) as demo:
        gr.Markdown(title)
        gr.Markdown(description)
        with gr.Tab("Youtube videos") as youtube_tab:
            with gr.Row():
                with gr.Column():
                    youtube_link = gr.Textbox(label="YouTube link", placeholder="Paste YouTube URL here")
                    video_player = gr.Video(autoplay=False)
                    download_finish = gr.State(value=False)
                    youtube_link.change(
                        fn=download_video,
                        inputs=[youtube_link, download_finish],
                        outputs=[video_player, download_finish]
                    )

            with gr.Row():
                with gr.Column(scale=2):
                    youtube_question = gr.Textbox(label="Your Question", placeholder="Default: What's this video talking about?")
                    youtube_has_subtitles = gr.Checkbox(label="Use subtitles", value=True)
                    youtube_input_note = """<p>For global questions set the number of neighbours to -1; otherwise use the default of 3.</p>"""
                    gr.Markdown(youtube_input_note)
                    # input number
                    youtube_number_of_neighbours = gr.Number(label="Number of Neighbours", interactive=True, value=3)
                    youtube_process_button = gr.Button("⛓️ Answer the Question (QA)")
                with gr.Column(scale=3):
                    youtube_answer = gr.Textbox(label="Answer of the question", lines=8, interactive=True, placeholder="Answer of the question will show up here.")
            youtube_process_button.click(fn=process_video, inputs=[youtube_link, youtube_has_subtitles, youtube_question, youtube_number_of_neighbours], outputs=[youtube_answer])
        with gr.Tab("Local videos") as local_tab:
            with gr.Row():
                with gr.Column():
                    local_video_player = gr.Video(sources=["upload"])
            with gr.Row():
                with gr.Column(scale=2):
                    local_question = gr.Textbox(label="Your Question", placeholder="Default: What's this video talking about?")
                    local_has_subtitles = gr.Checkbox(label="Use subtitles", value=True)
                    local_input_note = """<p>For global questions set the number of neighbours to -1; otherwise use the default of 3.</p>"""
                    gr.Markdown(local_input_note)
                    local_number_of_neighbours = gr.Number(label="Number of Neighbours", interactive=True, value=3)
                    local_process_button = gr.Button("⛓️ Answer the Question (QA)")
                with gr.Column(scale=3):
                    local_answer = gr.Textbox(label="Answer of the question", lines=8, interactive=True, placeholder="Answer of the question will show up here.")
            local_process_button.click(fn=process_video, inputs=[local_video_player, local_has_subtitles, local_question, local_number_of_neighbours], outputs=[local_answer])

    demo.queue(max_size=10).launch(show_error=True, share=True, show_api=False, server_port=5000)

if __name__ == "__main__":
    args = get_arguments()
    goldfish_obj = GoldFish_LV(args)
    run_gradio()
goldfish_inference.py
ADDED
@@ -0,0 +1,62 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import argparse
import gradio as gr
from goldfish_lv import GoldFish_LV
from theme import minigptlv_style
import time
def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

def get_arguments():
    parser = argparse.ArgumentParser(description="Inference parameters")
    parser.add_argument("--cfg-path", default="test_configs/llama2_test_config.yaml")
    parser.add_argument("--neighbours", type=int, default=3)
    parser.add_argument("--ckpt", type=str, default="checkpoints/video_llama_checkpoint_last.pth")
    parser.add_argument("--add_subtitles", action='store_true')
    parser.add_argument("--max_new_tokens", type=int, default=512)
    parser.add_argument("--use_openai_embedding", type=str2bool, default=False)
    parser.add_argument("--batch_size", type=int, default=2, help="Batch size for short video clips")
    parser.add_argument("--lora_r", type=int, default=64)
    parser.add_argument("--lora_alpha", type=int, default=16)
    parser.add_argument("--video_path", type=str, default="path for video.mp4", help="Path to the video file or YouTube URL")
    parser.add_argument("--question", type=str, default="Why is Rachel wearing a wedding dress?")
    parser.add_argument("--options", nargs="+")
    return parser.parse_args()

def download_video(youtube_url):
    processed_video_path = goldfish_lv.process_video_url(youtube_url)
    return processed_video_path

def process_video(video_path, has_subtitles, instruction="", number_of_neighbours=-1):
    result = goldfish_lv.inference(video_path, has_subtitles, instruction, number_of_neighbours)
    pred = result["pred"]
    return pred

def return_video_path(youtube_url):
    video_id = youtube_url.split("https://www.youtube.com/watch?v=")[-1].split('&')[0]
    if video_id:
        return os.path.join("workspace", "tmp", f"{video_id}.mp4")
    else:
        raise ValueError("Invalid YouTube URL provided.")

args = get_arguments()
if __name__ == "__main__":
    t1 = time.time()
    print("using openai: ", args.use_openai_embedding)
    goldfish_lv = GoldFish_LV(args)
    t2 = time.time()
    print("Time taken to load model: ", t2 - t1)
    processed_video_path = goldfish_lv.process_video_url(args.video_path)
    pred = process_video(processed_video_path, args.add_subtitles, args.question, args.neighbours)
    print("Question answer: ", pred)
    print("Time taken for inference: ", time.time() - t2)
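# A minimal sketch of a typical invocation of this script (hypothetical video
# path and question; the flags are the ones defined in get_arguments above):
#   python goldfish_inference.py --cfg-path test_configs/llama2_test_config.yaml \
#       --video_path "videos/my_long_video.mp4" \
#       --question "What happens at the end?" --neighbours 3 --add_subtitles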
goldfish_lv.py
ADDED
@@ -0,0 +1,654 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import time
import json
import argparse
import torch
import cv2
import moviepy.editor as mp
import webvtt
import re

from typing import Optional, List, Tuple
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from pytubefix import YouTube
from minigpt4.common.eval_utils import init_model
from minigpt4.conversation.conversation import CONV_VISION
from index import MemoryIndex
import pysrt
import chardet
from openai import OpenAI
if os.getenv("OPENAI_API_KEY") is not None:
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
else:
    client = OpenAI(api_key="")
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
# from split_long_video_in_parallel import split_video
import transformers
import whisper
from datetime import timedelta
# Function to format timestamps for VTT
def format_timestamp(seconds):
    td = timedelta(seconds=seconds)
    total_seconds = int(td.total_seconds())
    milliseconds = int(td.microseconds / 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"

def clean_text(subtitles_text):
    # Remove unwanted characters, keeping only letters, digits, whitespace, and single quotes
    subtitles_text = re.sub(r'[^a-zA-Z0-9\s\']', '', subtitles_text)
    # Replace multiple spaces with a single space
    subtitles_text = re.sub(r'\s+', ' ', subtitles_text)
    return subtitles_text.strip()
def time_to_seconds(subrip_time):
    return subrip_time.hours * 3600 + subrip_time.minutes * 60 + subrip_time.seconds + subrip_time.milliseconds / 1000

def split_subtitles(subtitle_path, n):
    # read the subtitle file and detect the encoding
    with open(subtitle_path, 'rb') as f:
        result = chardet.detect(f.read())
    subs = pysrt.open(subtitle_path, encoding=result['encoding'])

    total_subs = len(subs)

    if n <= 0 or n > total_subs:
        print("Invalid value for n. It should be a positive integer less than or equal to the total number of subtitles.")
        return None

    subs_per_paragraph = total_subs // n
    remainder = total_subs % n

    paragraphs = []

    current_index = 0

    for i in range(n):
        num_subs_in_paragraph = subs_per_paragraph + (1 if i < remainder else 0)

        paragraph_subs = subs[current_index:current_index + num_subs_in_paragraph]
        current_index += num_subs_in_paragraph

        # Join subtitles using pysrt's built-in method for efficient formatting
        paragraph = pysrt.SubRipFile(items=paragraph_subs).text
        paragraphs.append(paragraph)

    return paragraphs
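# A quick worked example of the helpers above (illustrative values):
#   format_timestamp(3661.5)        -> "01:01:01.500"
#   split_subtitles("subs.srt", 4)  -> four roughly equal paragraphs of subtitle text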
class GoldFish_LV:
    """
    The 'GoldFish_LV' class handles long-video processing and subtitle management
    on top of the MiniGPT4-video base model.
    """

    def __init__(self, args: argparse.Namespace) -> None:
        self.args = args
        self.model, self.vis_processor, whisper_gpu_id, minigpt4_gpu_id, answer_module_gpu_id = init_model(args)
        self.whisper_gpu_id = whisper_gpu_id
        self.minigpt4_gpu_id = minigpt4_gpu_id
        self.answer_module_gpu_id = answer_module_gpu_id
        # self.original_llama_model,self.original_llama_tokenizer=self.load_original_llama_model()
        # self.original_llama_model=self.load_original_llama_model_vllm()
        self.llama_3_1_model = self.load_llama3_1_model()
        self.whisper_model = whisper.load_model("large", device=f"cuda:{self.whisper_gpu_id}")
        # self.summary_instruction="Generate a description of this video. Pay close attention to the objects, actions, and emotions portrayed in the video, providing a vivid description of key moments. Specify any visual cues or elements that stand out."
        self.summary_instruction = "I'm a blind person, please provide me with a detailed summary of the video content and try to be as descriptive as possible."

    def load_original_llama_model(self):
        model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = "[PAD]"
        tokenizer.padding_side = "left"
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
        llama_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map={'': f"cuda:{self.answer_module_gpu_id}"},
            quantization_config=bnb_config,
        )
        return llama_model, tokenizer

    def load_llama3_1_model(self):
        model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
        self.llama3_tokenizer = AutoTokenizer.from_pretrained(model_id)
        llama3_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map={'': f"cuda:{self.answer_module_gpu_id}"},
            quantization_config=bnb_config,
        )
        pipeline = transformers.pipeline(
            "text-generation",
            model=llama3_model,
            tokenizer=self.llama3_tokenizer,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map=f"cuda:{self.answer_module_gpu_id}",
        )
        return pipeline



    def _youtube_download(self, url: str) -> str:
        try:
            video_id = url.split('v=')[-1].split('&')[0]
            video_id = video_id.strip()
            print(f"Downloading video with ID: {video_id}")
            youtube = YouTube(f"https://www.youtube.com/watch?v={video_id}")
            video_stream = youtube.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
            if not video_stream:
                raise ValueError("No suitable video stream found.")
            output_path = f"workspace/tmp/{video_id}.mp4"
            self.video_id = video_id
            video_stream.download(output_path="workspace/tmp", filename=f"{video_id}.mp4")
            return output_path
        except Exception as e:
            print(f"Error downloading video: {e}")
            return url

    @staticmethod
    def is_youtube_url(url: str) -> bool:
        youtube_regex = (
            r'(https?://)?(www\.)?'
            r'(youtube|youtu|youtube-nocookie)\.(com|be)/'
            r'(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})'
        )
        return bool(re.match(youtube_regex, url))

    def process_video_url(self, video_path: str) -> str:
        if self.is_youtube_url(video_path):
            return self._youtube_download(video_path)
        else:
            return video_path

    def create_video_grid(self, images: list, rows: int, cols: int, save_path: str) -> Image.Image:
        image_width, image_height = images[0].size
        grid_width = cols * image_width
        grid_height = rows * image_height
        new_image = Image.new("RGB", (grid_width, grid_height))
        for i in range(rows):
            for j in range(cols):
                index = i * cols + j
                if index < len(images):
                    image = images[index]
                    x_offset = j * image_width
                    y_offset = i * image_height
                    new_image.paste(image, (x_offset, y_offset))

        new_image.save(save_path)
        return new_image
    def get_subtitles(self, video_path):
        video_name = video_path.split('/')[-2]
        video_id = video_path.split('/')[-1].split('.')[0]
        audio_dir = f"workspace/audio/{video_name}"
        subtitle_dir = f"workspace/subtitles/{video_name}"
        os.makedirs(audio_dir, exist_ok=True)
        os.makedirs(subtitle_dir, exist_ok=True)
        # if the subtitles are already generated, return the path of the subtitles
        subtitle_path = f"{subtitle_dir}/{video_id}" + '.vtt'
        if os.path.exists(subtitle_path):
            return f"{subtitle_dir}/{video_id}" + '.vtt'
        audio_path = f"{audio_dir}/{video_id}" + '.mp3'
        try:
            self.extract_audio(video_path, audio_path)
            subtitle_path = f"{subtitle_dir}/{video_id}" + '.vtt'
            result = self.whisper_model.transcribe(audio_path, language="en")
            # Create VTT file
            with open(subtitle_path, "w", encoding="utf-8") as vtt_file:
                vtt_file.write("WEBVTT\n\n")
                for segment in result['segments']:
                    start = format_timestamp(segment['start'])
                    end = format_timestamp(segment['end'])
                    text = segment['text']
                    vtt_file.write(f"{start} --> {end}\n{text}\n\n")
            return subtitle_path
        except Exception as e:
            print(f"Error during subtitle generation for {video_path}: {e}")
            return None

    def prepare_input(self,
                      video_path: str,
                      subtitle_path: Optional[str],
                      instruction: str, previous_caption=""):
        # If a subtitle path is provided, read the VTT (Web Video Text Tracks) file, else leave the conversation empty
        conversation = ""
        if subtitle_path:
            vtt_file = webvtt.read(subtitle_path)
            print("Subtitle loaded successfully")
            try:
                for subtitle in vtt_file:
                    sub = subtitle.text.replace('\n', ' ')
                    conversation += sub
            except:
                pass
        if self.model.model_type == "Mistral":
            max_images_length = 90
            max_sub_len = 800
        else:
            max_images_length = 45
            max_sub_len = 400
        # Load the video file using moviepy and calculate the total number of frames
        clip = mp.VideoFileClip(video_path)
        total_num_frames = int(clip.duration * clip.fps)
        clip.close()
        # Calculate how often to sample a frame based on the total number of frames and the maximum images length
        cap = cv2.VideoCapture(video_path)
        images = []
        frame_count = 0
        sampling_interval = int(total_num_frames / max_images_length)
        if sampling_interval == 0:
            sampling_interval = 1
        # Initialize variables to hold image placeholders, current subtitle text, and subtitle history
        if previous_caption != "":
            img_placeholder = previous_caption + " "
        else:
            img_placeholder = ""
        subtitle_text_in_interval = ""
        history_subtitles = {}
        raw_frames = []
        number_of_words = 0
        transform = transforms.Compose([
            transforms.ToPILImage(),
        ])
        # Loop through each frame in the video
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # TODO: also add the subtitles to the external memory
            if subtitle_path is not None:
                for i, subtitle in enumerate(vtt_file):
                    sub = subtitle.text.replace('\n', ' ')
                    if (subtitle.start_in_seconds <= (frame_count / int(clip.fps)) <= subtitle.end_in_seconds) and sub not in subtitle_text_in_interval:
                        if not history_subtitles.get(sub, False):
                            subtitle_text_in_interval += sub + " "
                        history_subtitles[sub] = True
                        break
            # Process and store the frame at specified intervals
            if frame_count % sampling_interval == 0:
                raw_frames.append(Image.fromarray(cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB)))
                frame = transform(frame[:, :, ::-1])  # convert to RGB
                frame = self.vis_processor(frame)
                images.append(frame)
                img_placeholder += '<Img><ImageHere>'
                if subtitle_path is not None and subtitle_text_in_interval != "" and number_of_words < max_sub_len:
                    img_placeholder += f'<Cap>{subtitle_text_in_interval}'
                    number_of_words += len(subtitle_text_in_interval.split(' '))
                    subtitle_text_in_interval = ""
            frame_count += 1

            # Break the loop if the maximum number of images is reached
            if len(images) >= max_images_length:
                break

        cap.release()
        cv2.destroyAllWindows()

        # Return None if no images are extracted (three values, matching the normal return below)
        if len(images) == 0:
            return None, None, None
        while len(images) < max_images_length:
            images.append(images[-1])
            img_placeholder += '<Img><ImageHere>'
        images = torch.stack(images)
        print("Input instruction length", len(instruction.split(' ')))
        instruction = img_placeholder + '\n' + instruction
        print("number of words", number_of_words)
        print("number of images", len(images))

        return images, instruction, conversation

    def extract_audio(self, video_path: str, audio_path: str) -> None:
        video_clip = mp.VideoFileClip(video_path)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(audio_path, codec="libmp3lame", bitrate="320k")

    def short_video_inference(self, video_path, instruction, gen_subtitles=True):
        if gen_subtitles:
            subtitle_path = self.get_subtitles(video_path)
        else:
            subtitle_path = None
        prepared_images, prepared_instruction, video_conversation = self.prepare_input(video_path, subtitle_path, instruction)
        if prepared_images is None:
            return "Video can't be opened, check the video path again"
        length = len(prepared_images)
        prepared_images = prepared_images.unsqueeze(0)
        conv = CONV_VISION.copy()
        conv.system = ""
        # if you want a multi-turn conversation, comment the 2 lines above and make conv a global variable
        conv.append_message(conv.roles[0], prepared_instruction)
        conv.append_message(conv.roles[1], None)
        prompt = [conv.get_prompt()]
        answers = self.model.generate(prepared_images, prompt, max_new_tokens=512, do_sample=False, lengths=[length], num_beams=1)
        return answers[0]

    def split_long_video_into_clips(self, video_path):
        # Split the video into 90-second clips, queue them, and run inference on each clip
        self.video_name = video_path.split('/')[-1].split('.')[0]
        tmp_save_path = f"workspace/tmp/{self.video_name}"
        os.makedirs(tmp_save_path, exist_ok=True)
        print("tmp_save_path", tmp_save_path)

        if len(os.listdir(tmp_save_path)) == 0:
            print("Splitting Long video")
            os.system(f"python split_long_video_in_parallel.py --video_path {video_path} --output_folder {tmp_save_path}")
            # split_video(video_path, tmp_save_path, clip_duration=90)
        videos_list = sorted(os.listdir(tmp_save_path))
        return videos_list, tmp_save_path
    def long_inference_video(self, videos_list, tmp_save_path, subtitle_paths) -> Optional[str]:
        save_long_videos_path = "new_workspace/clips_summary/demo"
        os.makedirs(save_long_videos_path, exist_ok=True)
        file_path = f'{save_long_videos_path}/{self.video_name}.json'

        if os.path.exists(file_path):
            print("Clips inference already done")
            with open(file_path, 'r') as file:
                video_information = json.load(file)
        else:
            video_number = 0
            batch_size = self.args.batch_size
            batch_video_paths, batch_instructions, batch_subtitles = [], [], []
            video_information = {}
            video_captions = []
            for i, video in tqdm(enumerate(videos_list), desc="Inference video clips", total=len(videos_list)):
                clip_path = os.path.join(tmp_save_path, video)
                batch_video_paths.append(clip_path)
                # previous_caption = "You are analysing one long video of multiple clips and this is the summary from all previous clips :"+video_captions[-1]+"\n\n" if video_captions else ""
                previous_caption = ""
                batch_instructions.append(self.summary_instruction)
                batch_subtitles.append(subtitle_paths[i])
                # Process each batch
                if len(batch_video_paths) % batch_size == 0 and i != 0:
                    batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles, previous_caption)
                    for pred, subtitle in zip(batch_preds, videos_conversation):
                        video_number += 1
                        save_name = f"{video_number}".zfill(5)
                        if pred != "":
                            video_information[f'caption__{save_name}'] = pred
                        if subtitle != "":
                            video_information[f'subtitle__{save_name}'] = subtitle
                        video_captions.append(pred)
                    batch_video_paths, batch_instructions, batch_subtitles = [], [], []

            # Process any remaining videos in the last batch
            if batch_video_paths:
                batch_preds, videos_conversation = self.run_batch(batch_video_paths, batch_instructions, batch_subtitles, previous_caption)
                for pred, subtitle in zip(batch_preds, videos_conversation):
                    video_number += 1
                    save_name = f"{video_number}".zfill(5)
                    if pred != "":
                        video_information[f'caption__{save_name}'] = pred
                    if subtitle != "":
                        video_information[f'subtitle__{save_name}'] = subtitle
                    video_captions.append(pred)
            with open(file_path, 'w') as file:
                json.dump(video_information, file, indent=4)
            print("Clips inference done")
        return video_information
    # def inference_RAG(self, instructions, context_list):
    #     context_prompts=[]
    #     questions_prompts=[]
    #     try:
    #         for instruction,context in zip(instructions,context_list):
    #             context=clean_text(context)
    #             context_prompt=f"<s>[INST] Your task is to answer questions for one long video which is split into multiple clips.\nGiven these related information from the most related clips: \n{context}\n"
    #             question_prompt=f"\nAnswer this question :{instruction} \n your answer is: [/INST]"
    #             context_prompts.append(context_prompt)
    #             questions_prompts.append(question_prompt)
    #         context_inputs = self.original_llama_tokenizer(context_prompts, return_tensors="pt", padding=True, truncation=True,max_length=3500)
    #         # print(context_inputs.keys())
    #         print("context_inputs shape",context_inputs['input_ids'].shape)
    #         question_inputs = self.original_llama_tokenizer(questions_prompts, return_tensors="pt", padding=True, truncation=True,max_length=300)
    #         print("question_inputs shape",question_inputs['input_ids'].shape)
    #         # concatenate the context and the question together
    #         inputs_ids=torch.cat((context_inputs['input_ids'],question_inputs['input_ids']),dim=1).to('cuda')
    #         print("inputs shape",inputs_ids.shape)
    #     except Exception as e:
    #         print("error while tokenization",e)
    #         return self.inference_RAG_batch_size_1(instructions, context_list)
    #     with torch.no_grad():
    #         summary_ids = self.original_llama_model.generate(inputs_ids,max_new_tokens=512)
    #     answers=[]
    #     for i in range(len(summary_ids)):
    #         output_text=self.original_llama_tokenizer.decode(summary_ids[i], skip_special_tokens=True)
    #         output_text = output_text.split('</s>')[0]  # remove the stop sign </s>
    #         output_text = output_text.replace("<s>", "")
    #         output_text = output_text.split(r'[/INST]')[-1].strip()
    #         answers.append(output_text)
    #     return answers
    def inference_RAG(self, instructions, context_list):
        messages = []
        for instruction, context in zip(instructions, context_list):
            context = clean_text(context)
            context_prompt = f"Your task is to answer a specific question based on one long video. While you cannot view the video yourself, I will supply you with the most relevant text information from the most pertinent clips. \n{context}\n"
            question_prompt = f"\nPlease provide a detailed and accurate answer to the following question:{instruction} \n Your answer should be:"
            # limit the context to 10000 words due to hardware limitations
            context_words = context_prompt.split(' ')
            truncated_context = ' '.join(context_words[:10000])
            print("Number of words", len((truncated_context + question_prompt).split(' ')))
            messages.append([{"role": "user", "content": truncated_context + question_prompt}])
        outputs = self.llama_3_1_model(messages, max_new_tokens=512)
        answers = []
        for out in outputs:
            answers.append(out[0]["generated_text"][-1]['content'])
        return answers
    # def inference_RAG(self, instructions, context_list):
    #     prompts=[]
    #     for instruction,context in zip(instructions,context_list):
    #         context=clean_text(context)
    #         context_prompt=f"Your task is to answer questions for one long video which is split into multiple clips.\nGiven these related information from the most related clips: \n{context}\n"
    #         question_prompt=f"\nAnswer this question :{instruction} \n your answer is:"
    #         prompts.append(context_prompt+question_prompt)

    #     with open('prompts.txt','w') as f:
    #         for prompt in prompts:
    #             f.write(prompt+'\n')

    #     outputs=self.original_llama_model.generate(prompts)
    #     answers=[]
    #     for out in outputs:
    #         answers.append(out.outputs[0].text)
    #     return answers
    def inference_RAG_batch_size_1(self, instructions, context_list):
        answers = []
        for instruction, context in zip(instructions, context_list):
            context = clean_text(context)
            context_prompt = f"<s>[INST] Your task is to answer questions for one long video which is split into multiple clips.\nGiven these related information from the most related clips: \n{context}\n"
            question_prompt = f"\nAnswer this question :{instruction} \n your answer is: [/INST]"
            context_inputs = self.original_llama_tokenizer([context_prompt], return_tensors="pt", padding=True, truncation=True, max_length=3500)['input_ids']
            question_inputs = self.original_llama_tokenizer([question_prompt], return_tensors="pt", padding=True, truncation=True, max_length=300)['input_ids']

            inputs_ids = torch.cat((context_inputs, question_inputs), dim=1).to('cuda')
            with torch.no_grad():
                summary_ids = self.original_llama_model.generate(inputs_ids, max_new_tokens=512)

            output_text = self.original_llama_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            output_text = output_text.split('</s>')[0]  # remove the stop sign </s>
            output_text = output_text.replace("<s>", "")
            output_text = output_text.split(r'[/INST]')[-1].strip()
            answers.append(output_text)

        return answers

    # def inference_RAG_text_only(self, instructions, context_list):
    #     # Use VideoLLM as the answer module
    #     seg_tokens=[]
    #     for instruction,context in zip(instructions,context_list):
    #         context=clean_text(context)
    #         context_prompt=f"<s>[INST] Your task is to answer questions for one long video which is split into multiple clips.\nGiven these related information from the most related clips: \n{context}\n"
    #         question_prompt=f"\nAnswer this question :{instruction} \n your answer is: [/INST]"
    #         context_inputs = self.model.llama_tokenizer(context_prompt,add_special_tokens=True, return_tensors="pt", padding=True, truncation=True,max_length=3500)
    #         question_inputs = self.model.llama_tokenizer(question_prompt, return_tensors="pt", padding=True, truncation=True,max_length=300)
    #         # concatenate the context and the question together
    #         inputs_ids=torch.cat((context_inputs['input_ids'],question_inputs['input_ids']),dim=1).to('cuda')
    #         seg_tokens.append(inputs_ids)
    #     with torch.no_grad():
    #         answers = self.model.generate_text_only(images=None,seg_tokens=seg_tokens,max_new_tokens=512)
    #     return answers


    def inference_RAG_chatGPT(self, instructions, context_list) -> List[str]:
        batch_preds = []
        for context, instruction in zip(context_list, instructions):
            prompt = "Your task is to answer questions for long video \n\n Given these related information from the most related clips: \n " + context + "\n\n" + "Answer this question: " + instruction
            while True:
                try:
                    # openai>=1.0 client API (the old client.ChatCompletion.create no longer exists)
                    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {
                                "role": "user",
                                "content": prompt
                            }],
                    )
                    answer = response.choices[0].message.content
                    batch_preds.append(answer)
                    break
                except Exception as e:
                    print("chat gpt error", e)
                    time.sleep(50)

        return batch_preds

    def get_most_related_clips(self, related_context_keys):
        most_related_clips = set()
        for context_key in related_context_keys:
            if len(context_key.split('__')) > 1:
                most_related_clips.add(context_key.split('__')[1])
            if len(most_related_clips) == self.args.neighbours:
                break
        assert len(most_related_clips) != 0, f"No related clips found {related_context_keys}"
        return list(most_related_clips)
    def get_related_context(self, external_memory, related_context_keys):
        related_information = ""
        most_related_clips = self.get_most_related_clips(related_context_keys)
        for clip_name in most_related_clips:
            clip_conversation = ""
            general_sum = ""
            for key in external_memory.documents.keys():
                if clip_name in key and 'caption' in key:
                    general_sum = "Clip Summary: " + external_memory.documents[key]
                if clip_name in key and 'subtitle' in key:
                    clip_conversation = "Clip Subtitles: " + external_memory.documents[key]
            related_information += f"{general_sum},{clip_conversation}\n"
        return related_information
    def inference(self, video_path, use_subtitles=True, instruction="", number_of_neighbours=3):
        start_time = time.time()
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        self.args.neighbours = number_of_neighbours
        print(f"Video name: {video_name}")
        video_duration = mp.VideoFileClip(video_path).duration
        print(f"Video duration: {video_duration:.2f} seconds")
        # if the video is longer than 3 minutes (180 seconds) we need to run the long-video inference
        if video_duration > 180:
            print("Long video")
            # if the video data is already stored in the external memory we can use it directly, else we need to run the long inference
            file_path = f'new_workspace/clips_summary/demo/{video_name}.json'
            if not os.path.exists(file_path):
                print("Clips summary is not ready")
                videos_list, tmp_save_path = self.split_long_video_into_clips(video_path)
                subtitle_paths = []
                for video_p in videos_list:
                    clip_path = os.path.join(tmp_save_path, video_p)
                    subtitle_path = self.get_subtitles(clip_path) if use_subtitles else None
                    subtitle_paths.append(subtitle_path)
                clips_summary = self.long_inference_video(videos_list, tmp_save_path, subtitle_paths)
            else:
                print("External memory is ready")
            os.makedirs("new_workspace/embedding/demo", exist_ok=True)
            os.makedirs("new_workspace/open_ai_embedding/demo", exist_ok=True)
            if self.args.use_openai_embedding:
                embedding_path = f"new_workspace/open_ai_embedding/demo/{video_name}.pkl"
            else:
                embedding_path = f"new_workspace/embedding/demo/{video_name}.pkl"
            external_memory = MemoryIndex(self.args.neighbours, use_openai=self.args.use_openai_embedding)
            if os.path.exists(embedding_path):
                print("Loading embeddings from pkl file")
                external_memory.load_embeddings_from_pkl(embedding_path)
            else:
                # embed the information and save it in the pkl file
                external_memory.load_documents_from_json(file_path, embedding_path)
            # get the most similar context from the external memory to this instruction

            related_context_documents, related_context_keys = external_memory.search_by_similarity(instruction)
            related_information = self.get_related_context(external_memory, related_context_keys)
            pred = self.inference_RAG([instruction], [related_information])
        else:
            print("Short video")
            self.video_name = video_path.split('/')[-1].split('.')[0]
            pred = self.short_video_inference(video_path, instruction, use_subtitles)
        processing_time = time.time() - start_time
        print(f"Processing time: {processing_time:.2f} seconds")
        return {
            'video_name': os.path.splitext(os.path.basename(video_path))[0],
            'pred': pred,
        }


    def run_batch(self, video_paths, instructions, subtitle_paths, previous_caption="") -> Tuple[List[str], List[str]]:

        prepared_images_batch = []
        prepared_instructions_batch = []
        lengths_batch = []
        videos_conversations = []

        for i, video_path, instruction in zip(range(len(video_paths)), video_paths, instructions):
            subtitle_path = subtitle_paths[i]
            prepared_images, prepared_instruction, video_conversation = self.prepare_input(video_path, subtitle_path, instruction, previous_caption)

            if prepared_images is None:
                print(f"Error: Unable to open video at {video_path}. Check the path and try again.")
                continue
            videos_conversations.append(video_conversation)
            conversation = CONV_VISION.copy()
            conversation.system = ""
            conversation.append_message(conversation.roles[0], prepared_instruction)
            conversation.append_message(conversation.roles[1], None)
            prepared_instructions_batch.append(conversation.get_prompt())
            prepared_images_batch.append(prepared_images)
            lengths_batch.append(len(prepared_images))

        if not prepared_images_batch:
            return [], []

        prepared_images_batch = torch.stack(prepared_images_batch)
        answers = self.model.generate(prepared_images_batch, prepared_instructions_batch, max_new_tokens=self.args.max_new_tokens, do_sample=False, lengths=lengths_batch, num_beams=1)
        return answers, videos_conversations

    def run_images_features(self, img_embeds, prepared_instruction):
        lengths = []
        prompts = []
        for i in range(img_embeds.shape[0]):
            conv = CONV_VISION.copy()
            conv.system = ""
            conv.append_message(conv.roles[0], prepared_instruction[i])
            conv.append_message(conv.roles[1], None)
            prompts.append(conv.get_prompt())
            lengths.append(len(img_embeds[i]))

        answers = self.model.generate(images=None, img_embeds=img_embeds, texts=prompts, max_new_tokens=300, do_sample=False, lengths=lengths, num_beams=1)
        return answers

    def run_images(self, prepared_images, prepared_instruction):
        lengths = []
        prompts = []
        for i in range(prepared_images.shape[0]):
            conv = CONV_VISION.copy()
            conv.system = ""
            conv.append_message(conv.roles[0], prepared_instruction[i])
            conv.append_message(conv.roles[1], None)
            prompts.append(conv.get_prompt())
            lengths.append(len(prepared_images[i]))
        answers = self.model.generate(prepared_images, prompts, max_new_tokens=300, do_sample=False, lengths=lengths, num_beams=1)
        return answers
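# Note on retrieval breadth: passing number_of_neighbours=-1 to inference() makes
# MemoryIndex.search_by_similarity return every stored clip document instead of
# the top-k, which is what the demo recommends for "global" questions that span
# the whole video.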
index.py
ADDED
@@ -0,0 +1,103 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import torch
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from typing import List, Dict, Tuple, Union
from PIL import Image
import pickle
from openai import OpenAI
import os
import time
import yaml

class MemoryIndex:
    def __init__(self, number_of_neighbours, use_openai=False):
        self.documents = {}
        self.document_vectors = {}
        self.use_openai = use_openai
        if use_openai:
            api_key = os.getenv("OPENAI_API_KEY")
            self.client = OpenAI(api_key=api_key)
        self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        # self.model = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        with open('test_configs/llama2_test_config.yaml') as file:
            config = yaml.load(file, Loader=yaml.FullLoader)
        embedding_gpu_id = config['model']['minigpt4_gpu_id']
        self.device = f"cuda:{embedding_gpu_id}" if torch.cuda.is_available() else "cpu"
        self.number_of_neighbours = int(number_of_neighbours)

    def load_documents_from_json(self, file_path, embedding_path=""):

        with open(file_path, 'r') as file:
            data = json.load(file)
        for doc_id, doc_data in data.items():
            self.documents[doc_id] = doc_data
            self.document_vectors[doc_id] = self._compute_sentence_embedding(doc_data)

        # save self.documents and self.document_vectors to a pkl file
        m = [self.documents, self.document_vectors]
        with open(embedding_path, 'wb') as file:
            pickle.dump(m, file)
        return embedding_path
    def load_embeddings_from_pkl(self, pkl_file_path):
        # read the pkl file
        with open(pkl_file_path, 'rb') as file:
            data = pickle.load(file)
        self.documents = data[0]
        self.document_vectors = data[1]


    def load_data_from_pkl(self, pkl_file_path):
        with open(pkl_file_path, 'rb') as file:
            data = pickle.load(file)
        for doc_id, doc_data in data.items():
            self.documents[doc_id] = doc_data
            self.document_vectors[doc_id] = doc_data
    def _compute_sentence_embedding(self, text: str) -> torch.Tensor:
        if self.use_openai:
            done = False
            while not done:
                try:
                    embedding = self.client.embeddings.create(input=[text], model="text-embedding-3-small").data[0].embedding
                    # Convert the list to a PyTorch tensor
                    embedding = torch.tensor(embedding)
                    done = True
                except Exception as e:
                    print("error", e)
                    print("text", text)
                    # sleep for 5 seconds and try again
                    time.sleep(5)
                    continue
        else:
            return self.model.encode(text, convert_to_tensor=True).to(self.device)

        return embedding

    def search_by_similarity(self, query: str) -> Tuple[List[str], List[str]]:

        query_vector = self._compute_sentence_embedding(query)
        scores = {doc_id: torch.nn.functional.cosine_similarity(query_vector, doc_vector, dim=0).item()
                  for doc_id, doc_vector in self.document_vectors.items()}
        sorted_doc_ids = sorted(scores, key=scores.get, reverse=True)
        sorted_documents = [self.documents[doc_id] for doc_id in sorted_doc_ids]
        if self.number_of_neighbours == -1:
            return list(self.documents.values()), list(self.documents.keys())
        if self.number_of_neighbours > len(sorted_documents):
            return sorted_documents, sorted_doc_ids
        # if the top retrieved document is the overall summary, also return the next
        # document to guarantee that a clip name is always retrieved
        if self.number_of_neighbours == 1 and sorted_doc_ids[0] == 'summary':
            return sorted_documents[0:2], sorted_doc_ids[:2]
        print("Number of neighbours", self.number_of_neighbours)
        return sorted_documents[:self.number_of_neighbours], sorted_doc_ids[:self.number_of_neighbours]

# # main function
# if __name__ == "__main__":
#     memory_index = MemoryIndex(-1,use_openai=True)
#     memory_index.load_documents_from_json('workspace/results/llama_vid/tt0035423.json')
#     print(memory_index.documents.keys())
#     docs,keys=memory_index.search_by_similarity('kerolos')
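# # A minimal local-embedding sketch (hypothetical file names):
# #   index = MemoryIndex(number_of_neighbours=3, use_openai=False)
# #   index.load_documents_from_json("new_workspace/clips_summary/demo/my_video.json",
# #                                  "new_workspace/embedding/demo/my_video.pkl")
# #   docs, keys = index.search_by_similarity("What happens at the wedding?")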
minigpt4/__init__.py
ADDED
@@ -0,0 +1,31 @@
"""
 Copyright (c) 2022, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import os
import sys

from omegaconf import OmegaConf

from minigpt4.common.registry import registry

from minigpt4.datasets.builders import *
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.tasks import *


root_dir = os.path.dirname(os.path.abspath(__file__))
default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))

registry.register_path("library_root", root_dir)
repo_root = os.path.join(root_dir, "..")
registry.register_path("repo_root", repo_root)
cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
registry.register_path("cache_root", cache_root)

registry.register("MAX_INT", sys.maxsize)
registry.register("SPLIT_NAMES", ["train", "val", "test"])
minigpt4/common/__init__.py
ADDED
File without changes
minigpt4/common/config.py
ADDED
@@ -0,0 +1,474 @@
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import json
|
| 10 |
+
from typing import Dict
|
| 11 |
+
|
| 12 |
+
from omegaconf import OmegaConf
|
| 13 |
+
from minigpt4.common.registry import registry
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Config:
|
| 17 |
+
def __init__(self, args):
|
| 18 |
+
self.config = {}
|
| 19 |
+
|
| 20 |
+
self.args = args
|
| 21 |
+
|
| 22 |
+
# Register the config and configuration for setup
|
| 23 |
+
registry.register("configuration", self)
|
| 24 |
+
|
| 25 |
+
user_config = self._build_opt_list(self.args.options)
|
| 26 |
+
|
| 27 |
+
config = OmegaConf.load(self.args.cfg_path)
|
| 28 |
+
|
| 29 |
+
runner_config = self.build_runner_config(config)
|
| 30 |
+
model_config = self.build_model_config(config, **user_config)
|
| 31 |
+
dataset_config = self.build_dataset_config(config)
|
| 32 |
+
|
| 33 |
+
# Validate the user-provided runner configuration
|
| 34 |
+
# model and dataset configuration are supposed to be validated by the respective classes
|
| 35 |
+
# [TODO] validate the model/dataset configuration
|
| 36 |
+
# self._validate_runner_config(runner_config)
|
| 37 |
+
|
| 38 |
+
# Override the default configuration with user options.
|
| 39 |
+
self.config = OmegaConf.merge(
|
| 40 |
+
runner_config, model_config, dataset_config, user_config
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
def _validate_runner_config(self, runner_config):
|
| 44 |
+
"""
|
| 45 |
+
This method validates the configuration, such that
|
| 46 |
+
1) all the user specified options are valid;
|
| 47 |
+
2) no type mismatches between the user specified options and the config.
|
| 48 |
+
"""
|
| 49 |
+
runner_config_validator = create_runner_config_validator()
|
| 50 |
+
runner_config_validator.validate(runner_config)
|
| 51 |
+
|
| 52 |
+
def _build_opt_list(self, opts):
|
| 53 |
+
opts_dot_list = self._convert_to_dot_list(opts)
|
| 54 |
+
return OmegaConf.from_dotlist(opts_dot_list)
|
| 55 |
+
|
| 56 |
+
@staticmethod
|
| 57 |
+
def build_model_config(config, **kwargs):
|
| 58 |
+
model = config.get("model", None)
|
| 59 |
+
assert model is not None, "Missing model configuration file."
|
| 60 |
+
|
| 61 |
+
model_cls = registry.get_model_class(model.arch)
|
| 62 |
+
assert model_cls is not None, f"Model '{model.arch}' has not been registered."
|
| 63 |
+
|
| 64 |
+
model_type = kwargs.get("model.model_type", None)
|
| 65 |
+
if not model_type:
|
| 66 |
+
model_type = model.get("model_type", None)
|
| 67 |
+
# else use the model type selected by user.
|
| 68 |
+
|
| 69 |
+
assert model_type is not None, "Missing model_type."
|
| 70 |
+
|
| 71 |
+
print("--------------")
|
| 72 |
+
print("model arch",model.arch)
|
| 73 |
+
print("model cls",model_cls)
|
| 74 |
+
|
| 75 |
+
model_config_path = model_cls.PRETRAINED_MODEL_CONFIG_DICT[model_type]
|
| 76 |
+
|
| 77 |
+
model_config = OmegaConf.create()
|
| 78 |
+
# hierarchy override, customized config > default config
|
| 79 |
+
model_config = OmegaConf.merge(
|
| 80 |
+
model_config,
|
| 81 |
+
OmegaConf.load(model_config_path),
|
| 82 |
+
{"model": config["model"]},
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
return model_config
|
| 86 |
+
|
| 87 |
+

    @staticmethod
    def build_runner_config(config):
        return {"run": config.run}

    @staticmethod
    def build_dataset_config(config):
        datasets = config.get("datasets", None)
        if datasets is None:
            raise KeyError(
                "Expecting 'datasets' as the root key for dataset configuration."
            )

        dataset_config = OmegaConf.create()

        for dataset_name in datasets:
            print("dataset name", dataset_name)
            builder_cls = registry.get_builder_class(dataset_name)

            dataset_config_type = datasets[dataset_name].get("type", "default")
            dataset_config_path = builder_cls.default_config_path(
                type=dataset_config_type
            )

            # hierarchy override: customized config > default config
            dataset_config = OmegaConf.merge(
                dataset_config,
                OmegaConf.load(dataset_config_path),
                {"datasets": {dataset_name: config["datasets"][dataset_name]}},
            )

        return dataset_config
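
    # Expected YAML shape (a minimal, hypothetical illustration; the arch and
    # dataset names below are placeholders, not values from this repo):
    #
    #   model:
    #     arch: some_registered_arch
    #     model_type: some_model_type
    #   datasets:
    #     some_registered_dataset:
    #       type: default
    #   run:
    #     task: some_task
    #     seed: 42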

    def _convert_to_dot_list(self, opts):
        if opts is None:
            opts = []

        if len(opts) == 0:
            return opts

        has_equal = opts[0].find("=") != -1

        if has_equal:
            return opts

        return [(opt + "=" + value) for opt, value in zip(opts[0::2], opts[1::2])]
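
    # Illustration (hypothetical values): pair-style command-line options are
    # zipped into OmegaConf dot-list form before merging, while options that
    # already contain "=" pass through unchanged:
    #
    #   ["run.seed", "42", "model.model_type", "llama2"]
    #       -> ["run.seed=42", "model.model_type=llama2"]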

    def get_config(self):
        return self.config

    @property
    def run_cfg(self):
        return self.config.run

    @property
    def datasets_cfg(self):
        return self.config.datasets

    @property
    def model_cfg(self):
        return self.config.model

    def pretty_print(self):
        logging.info("\n===== Running Parameters =====")
        logging.info(self._convert_node_to_json(self.config.run))

        logging.info("\n====== Dataset Attributes ======")
        datasets = self.config.datasets

        for dataset in datasets:
            if dataset in self.config.datasets:
                logging.info(f"\n======== {dataset} =======")
                dataset_config = self.config.datasets[dataset]
                logging.info(self._convert_node_to_json(dataset_config))
            else:
                logging.warning(f"No dataset named '{dataset}' in config. Skipping")

        logging.info("\n====== Model Attributes ======")
        logging.info(self._convert_node_to_json(self.config.model))

    def _convert_node_to_json(self, node):
        container = OmegaConf.to_container(node, resolve=True)
        return json.dumps(container, indent=4, sort_keys=True)

    def to_dict(self):
        return OmegaConf.to_container(self.config)


def node_to_dict(node):
    return OmegaConf.to_container(node)


class ConfigValidator:
    """
    This is a preliminary implementation to centralize and validate the configuration.
    May be altered in the future.

    A helper class to validate configurations from yaml files.

    This serves the following purposes:
        1. Ensure all the options in the yaml are defined; raise an error if not.
        2. Raise an error when type mismatches are found.
        3. Provide a central place to store and display helpful messages for supported configurations.
    """

    class _Argument:
        def __init__(self, name, choices=None, type=None, help=None):
            self.name = name
            self.val = None
            self.choices = choices
            self.type = type
            self.help = help

        def __str__(self):
            s = f"{self.name}={self.val}"
            if self.type is not None:
                s += f", ({self.type})"
            if self.choices is not None:
                s += f", choices: {self.choices}"
            if self.help is not None:
                s += f", ({self.help})"
            return s

    def __init__(self, description):
        self.description = description

        self.arguments = dict()

        self.parsed_args = None

    def __getitem__(self, key):
        assert self.parsed_args is not None, "No arguments parsed yet."

        return self.parsed_args[key]

    def __str__(self) -> str:
        return self.format_help()

    def add_argument(self, *args, **kwargs):
        """
        Assume the first argument is the name of the argument.
        """
        self.arguments[args[0]] = self._Argument(*args, **kwargs)

    def validate(self, config=None):
        """
        Validate the given config (a dict-like object loaded from yaml) against
        the registered arguments: unknown keys raise an error, values are
        coerced to the declared type, and choices are enforced.
        """
        for k, v in config.items():
            assert (
                k in self.arguments
            ), f"""{k} is not a valid argument. Supported arguments are {self.format_arguments()}."""

            if self.arguments[k].type is not None:
                try:
                    self.arguments[k].val = self.arguments[k].type(v)
                except ValueError:
                    raise ValueError(f"{k} is not a valid {self.arguments[k].type}.")

            if self.arguments[k].choices is not None:
                assert (
                    v in self.arguments[k].choices
                ), f"""{k} must be one of {self.arguments[k].choices}."""

        return config

    def format_arguments(self):
        return str([f"{k}" for k in sorted(self.arguments.keys())])

    def format_help(self):
        # description + key-value pair string for each argument
        help_msg = str(self.description)
        return help_msg + ", available arguments: " + self.format_arguments()

    def print_help(self):
        # display help message
        print(self.format_help())


def create_runner_config_validator():
    validator = ConfigValidator(description="Runner configurations")

    validator.add_argument(
        "runner",
        type=str,
        choices=["runner_base", "runner_iter"],
        help="""Runner to use. The "runner_base" uses epoch-based training, while
        "runner_iter" runs based on iters. Default: runner_base""",
    )
    # add arguments for training dataset ratios
    validator.add_argument(
        "train_dataset_ratios",
        type=Dict[str, float],
        help="""Ratios of training datasets. This is used by the iteration-based runner.
        Not supported by the epoch-based runner, because defining an epoch becomes tricky.
        Default: None""",
    )
    validator.add_argument(
        "max_iters",
        type=float,
        help="Maximum number of iterations to run.",
    )
    validator.add_argument(
        "max_epoch",
        type=int,
        help="Maximum number of epochs to run.",
    )
    # add arguments for iters_per_inner_epoch
    validator.add_argument(
        "iters_per_inner_epoch",
        type=float,
        help="Number of iterations per inner epoch. This is required when runner is runner_iter.",
    )
    lr_scheds_choices = registry.list_lr_schedulers()
    validator.add_argument(
        "lr_sched",
        type=str,
        choices=lr_scheds_choices,
        help="Learning rate scheduler to use, from {}".format(lr_scheds_choices),
    )
    task_choices = registry.list_tasks()
    validator.add_argument(
        "task",
        type=str,
        choices=task_choices,
        help="Task to use, from {}".format(task_choices),
    )
    # add arguments for init_lr
    validator.add_argument(
        "init_lr",
        type=float,
        help="Initial learning rate. This will be the learning rate after warmup and before decay.",
    )
    # add arguments for min_lr
    validator.add_argument(
        "min_lr",
        type=float,
        help="Minimum learning rate (after decay).",
    )
    # add arguments for warmup_lr
    validator.add_argument(
        "warmup_lr",
        type=float,
        help="Starting learning rate for warmup.",
    )
    # add arguments for learning rate decay rate
    validator.add_argument(
        "lr_decay_rate",
        type=float,
        help="Learning rate decay rate. Required if using a decaying learning rate scheduler.",
    )
    # add arguments for weight decay
    validator.add_argument(
        "weight_decay",
        type=float,
        help="Weight decay rate.",
    )
    # add arguments for training batch size
    validator.add_argument(
        "batch_size_train",
        type=int,
        help="Training batch size.",
    )
    # add arguments for evaluation batch size
    validator.add_argument(
        "batch_size_eval",
        type=int,
        help="Evaluation batch size, including validation and testing.",
    )
    # add arguments for number of workers for data loading
    validator.add_argument(
        "num_workers",
        help="Number of workers for data loading.",
    )
    # add arguments for warmup steps
    validator.add_argument(
        "warmup_steps",
        type=int,
        help="Number of warmup steps. Required if a warmup schedule is used.",
    )
    # add arguments for random seed
    validator.add_argument(
        "seed",
        type=int,
        help="Random seed.",
    )
    # add arguments for output directory
    validator.add_argument(
        "output_dir",
        type=str,
        help="Output directory to save checkpoints and logs.",
    )
    # add arguments for whether to run evaluation only
    validator.add_argument(
        "evaluate",
        help="Whether to only evaluate the model. If true, training will not be performed.",
    )
    # add arguments for splits used for training, e.g. ["train", "val"]
    validator.add_argument(
        "train_splits",
        type=list,
        help="Splits to use for training.",
    )
    # add arguments for splits used for validation, e.g. ["val"]
    validator.add_argument(
        "valid_splits",
        type=list,
        help="Splits to use for validation. If not provided, validation will be skipped.",
    )
    # add arguments for splits used for testing, e.g. ["test"]
    validator.add_argument(
        "test_splits",
        type=list,
        help="Splits to use for testing. If not provided, testing will be skipped.",
    )
    # add arguments for accumulating gradient for iterations
    validator.add_argument(
        "accum_grad_iters",
        type=int,
        help="Number of iterations to accumulate gradient for.",
    )

    # ====== distributed training ======
    validator.add_argument(
        "device",
        type=str,
        choices=["cpu", "cuda"],
        help="Device to use. Only 'cuda' and 'cpu' are supported for now.",
    )
    validator.add_argument(
        "world_size",
        type=int,
        help="Number of processes participating in the job.",
    )
    validator.add_argument("dist_url", type=str)
    validator.add_argument("distributed", type=bool)
    # add argument for whether to use a distributed sampler during evaluation
    validator.add_argument(
        "use_dist_eval_sampler",
        type=bool,
        help="Whether to use a distributed sampler during evaluation or not.",
    )

    # ====== task specific ======
    # generation task specific arguments
    # add arguments for maximal length of text output
    validator.add_argument(
        "max_len",
        type=int,
        help="Maximal length of text output.",
    )
    # add arguments for minimal length of text output
    validator.add_argument(
        "min_len",
        type=int,
        help="Minimal length of text output.",
    )
    # add arguments for number of beams
    validator.add_argument(
        "num_beams",
        type=int,
        help="Number of beams used for beam search.",
    )

    # vqa task specific arguments
    # add arguments for number of answer candidates
    validator.add_argument(
        "num_ans_candidates",
        type=int,
        help="""For ALBEF and BLIP, these models first rank answers according to likelihood to select answer candidates.""",
    )
    # add arguments for inference method
    validator.add_argument(
        "inference_method",
        type=str,
        choices=["generate", "rank"],
        help="""Inference method to use for question answering. If rank, requires an answer list.""",
    )

    # ====== model specific ======
    validator.add_argument(
        "k_test",
        type=int,
        help="Number of top k most similar samples from ITC/VTC selection to be tested.",
    )

    return validator
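

# A minimal usage sketch of ConfigValidator (illustrative only; not part of the
# original module). It exercises the three behaviors implemented above: unknown
# keys raise, values are coerced to the declared type, and choices are
# enforced. The argument names and values here are hypothetical.
if __name__ == "__main__":
    demo = ConfigValidator(description="demo validator")
    demo.add_argument("max_epoch", type=int, help="Maximum epochs.")
    demo.add_argument("device", type=str, choices=["cpu", "cuda"])

    demo.validate({"max_epoch": "10"})  # "10" is coerced to int 10
    demo.validate({"device": "cuda"})   # accepted: "cuda" is a valid choice
    # demo.validate({"foo": 1})         # would raise AssertionError (unknown key)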

minigpt4/common/dist_utils.py
ADDED
@@ -0,0 +1,146 @@
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

import datetime
import functools
import os

import torch
import torch.distributed as dist
import timm.models.hub as timm_hub


def setup_for_distributed(is_master):
    """
    This function disables printing when not in the master process.
    """
    import builtins as __builtin__

    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print
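
# Illustration (hypothetical call sites): after setup_for_distributed(False),
# a plain print("hello") is silently dropped on non-master ranks, while
# print("hello", force=True) still reaches stdout, because the wrapper pops
# the custom "force" keyword before delegating to the original builtin print.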


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def init_distributed_mode(args):
    if args.distributed is False:
        print("Not using distributed mode")
        args.rank = 0
        return

    if "LOCAL_RANK" not in os.environ:
        os.environ["LOCAL_RANK"] = str(args.local_rank)

    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        args.rank = int(os.environ["RANK"])
        args.world_size = int(os.environ["WORLD_SIZE"])
        args.gpu = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        args.rank = int(os.environ["SLURM_PROCID"])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print("Not using distributed mode")
        args.distributed = False
        args.rank = 0
        return

    args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = "nccl"
    print(
        "| distributed init (rank {}, world {}): {}".format(
            args.rank, args.world_size, args.dist_url
        ),
        flush=True,
    )
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
        timeout=datetime.timedelta(
            days=365
        ),  # allow auto-downloading and de-compressing
    )
    torch.distributed.barrier()
    setup_for_distributed(args.rank == 0)
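
# Typical entry points (an illustrative sketch; "train.py" is a placeholder):
# torchrun exports RANK, WORLD_SIZE and LOCAL_RANK, so
#
#   torchrun --nproc_per_node=4 train.py --cfg-path some_config.yaml
#
# takes the first branch above, a SLURM job provides SLURM_PROCID and takes
# the second, and with neither set the function falls back to
# single-process mode.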


def get_dist_info():
    if torch.__version__ < "1.0":
        initialized = dist._initialized
    else:
        initialized = dist.is_initialized()
    if initialized:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    else:  # non-distributed training
        rank = 0
        world_size = 1
    return rank, world_size


def main_process(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        rank, _ = get_dist_info()
        if rank == 0:
            return func(*args, **kwargs)

    return wrapper
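
# Usage sketch (illustrative; save_checkpoint is a hypothetical function):
# the decorator turns a function into a no-op on all ranks except rank 0,
# which is handy for checkpointing and logging:
#
#   @main_process
#   def save_checkpoint(state, path):
#       torch.save(state, path)
#
# Note that the wrapper implicitly returns None on non-zero ranks, so
# decorated functions should not rely on their return value in distributed runs.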


def download_cached_file(url, check_hash=True, progress=False):
    """
    Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
    If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
    """

    def get_cached_file_path():
        # a hack to sync the file path across processes
        parts = torch.hub.urlparse(url)
        filename = os.path.basename(parts.path)
        cached_file = os.path.join(timm_hub.get_cache_dir(), filename)

        return cached_file

    if is_main_process():
        timm_hub.download_cached_file(url, check_hash, progress)

    if is_dist_avail_and_initialized():
        dist.barrier()

    return get_cached_file_path()
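
# Usage sketch (the URL is a placeholder, not a real checkpoint):
#
#   url = "https://example.com/checkpoints/model.pth"
#   cached_path = download_cached_file(url, check_hash=False, progress=True)
#   state_dict = torch.load(cached_path, map_location="cpu")
#
# In a distributed run only rank 0 downloads; the barrier keeps the other
# ranks waiting until the file exists in the shared timm cache directory.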