diff --git a/text_content/docs_audio_88b326ce.txt b/text_content/docs_audio_88b326ce.txt new file mode 100644 index 0000000000000000000000000000000000000000..f489d6d80378fe1630f070968b42acbfa681fb30 --- /dev/null +++ b/text_content/docs_audio_88b326ce.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/audio#inline-audio +Title: Audio understanding  |  Gemini API  |  Google AI for Developers +================================================== + +Audio understanding Gemini can analyze and understand audio input, enabling use cases like the following: Describe, summarize, or answer questions about audio content. Provide a transcription of the audio. Analyze specific segments of the audio. This guide shows you how to use the Gemini API to generate a text response to audio input. Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Input audio You can provide audio data to Gemini in the following ways: Upload an audio file before making a request to generateContent . Pass inline audio data with the request to generateContent . Upload an audio file You can use the Files API to upload an audio file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads an audio file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mp3" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass audio data inline Instead of uploading an audio file, you can pass inline audio data in the request to generateContent : Python from google.genai import types with open ( 'path/to/small-sample.mp3' , 'rb' ) as f : audio_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ 'Describe this audio clip' , types . Part . from_bytes ( data = audio_bytes , mime_type = 'audio/mp3' , ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64AudioFile = fs . readFileSync ( "path/to/small-sample.mp3" , { encoding : "base64" , }); const contents = [ { text : "Please summarize the audio." }, { inlineData : { mimeType : "audio/mp3" , data : base64AudioFile , }, }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } audioBytes , _ := os . ReadFile ( "/path/to/small-sample.mp3" ) parts := [] * genai . Part { genai . NewPartFromText ( "Describe this audio clip" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "audio/mp3" , Data : audioBytes , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } A few things to keep in mind about inline audio data: The maximum request size is 20 MB, which includes text prompts, system instructions, and files provided inline. If your file's size will make the total request size exceed 20 MB, then use the Files API to upload an audio file for use in the request. If you're using an audio sample multiple times, it's more efficient to upload an audio file . Get a transcript To get a transcript of audio data, just ask for it in the prompt: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) prompt = 'Generate a transcript of the speech.' response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ prompt , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Generate a transcript of the speech." , ]), }); console . log ( "result.text=" , result . text ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Generate a transcript of the speech." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Refer to timestamps You can refer to specific sections of an audio file using timestamps of the form MM:SS . For example, the following prompt requests a transcript that Starts at 2 minutes 30 seconds from the beginning of the file. Ends at 3 minutes 29 seconds from the beginning of the file. Python # Create a prompt containing timestamps. prompt = "Provide a transcript of the speech from 02:30 to 03:29." JavaScript // Create a prompt containing timestamps. const prompt = "Provide a transcript of the speech from 02:30 to 03:29." Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromText ( "Provide a transcript of the speech " + "between the timestamps 02:30 and 03:29." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Count tokens Call the countTokens method to get a count of the number of tokens in an audio file. For example: Python response = client . models . 
count_tokens ( model = 'gemini-2.5-flash' , contents = [ myfile ] ) print ( response ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), ]), }); console . log ( countTokensResponse . totalTokens ); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } localAudioPath := "/path/to/sample.mp3" uploadedFile , _ := client . Files . UploadFromPath ( ctx , localAudioPath , nil , ) parts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } tokens , _ := client . Models . CountTokens ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Printf ( "File %s is %d tokens\n" , localAudioPath , tokens . TotalTokens ) } Supported audio formats Gemini supports the following audio format MIME types: WAV - audio/wav MP3 - audio/mp3 AIFF - audio/aiff AAC - audio/aac OGG Vorbis - audio/ogg FLAC - audio/flac Technical details about audio Gemini represents each second of audio as 32 tokens; for example, one minute of audio is represented as 1,920 tokens. Gemini can "understand" non-speech components, such as birdsong or sirens. The maximum supported length of audio data in a single prompt is 9.5 hours. Gemini doesn't limit the number of audio files in a single prompt; however, the total combined length of all audio files in a single prompt can't exceed 9.5 hours. Gemini downsamples audio files to a 16 Kbps data resolution. If the audio source contains multiple channels, Gemini combines those channels into a single channel. What's next This guide shows how to generate text in response to audio data. To learn more, see the following resources: File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
\ No newline at end of file diff --git a/text_content/docs_batch-mode_44d7186d.txt b/text_content/docs_batch-mode_44d7186d.txt new file mode 100644 index 0000000000000000000000000000000000000000..de2db0820caf6d756565c3e5785e2a04835f3880 --- /dev/null +++ b/text_content/docs_batch-mode_44d7186d.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/batch-mode +Title: Batch Mode  |  Gemini API  |  Google AI for Developers +================================================== + +Batch Mode The Gemini API's Batch Mode is designed to process large volumes of requests asynchronously at 50% of the standard cost . The target turnaround time is 24 hours, but in the majority of cases, it is much quicker. Use Batch Mode for large-scale, non-urgent tasks such as data pre-processing or running evaluations where an immediate response is not required. Note: You can use Batch Mode with the Gemini API Python SDK or the REST API. Support for Batch Mode in the Gemini API JavaScript SDK is coming soon. Getting Started This section helps you get started with submitting your first requests in batch mode. Creating a batch job You have two ways to submit your requests in Batch Mode: Inline Requests : A list of GenerateContentRequest objects directly included in your batch creation request. This is suitable for smaller batches that keep the total request size under 20MB. The output returned from the model is a list of inlineResponse objects. Input File : A JSON Lines (JSONL) file where each line contains a complete GenerateContentRequest object. This method is recommended for larger requests. The output returned from the model is a JSONL file where each line is either a GenerateContentResponse or a status object. Inline requests For a small number of requests, you can directly embed the GenerateContentRequest objects within your BatchGenerateContentRequest . The following example calls the BatchGenerateContent method with inline requests: Python from google import genai from google.genai import types client = genai . Client () # A list of dictionaries, where each is a GenerateContentRequest inline_requests = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Tell me a one-sentence joke.' }], 'role' : 'user' }] }, { 'contents' : [{ 'parts' : [{ 'text' : 'Why is the sky blue?' }], 'role' : 'user' }] } ] inline_batch_job = client . batches . create ( model = "models/gemini-2.5-flash" , src = inline_requests , config = { 'display_name' : "inlined-requests-job-1" , }, ) print ( f "Created batch job: { inline_batch_job . name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly, you can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search' : {}}]} ]
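Such request lists are submitted exactly like the earlier inline example. The following is a minimal sketch, assuming the client object and the inline_requests_list defined above; the display name is an arbitrary placeholder:

# Sketch: submit the tool-enabled request list as an inline batch job.
# Assumes `client` and `inline_requests_list` from the examples above;
# "inlined-requests-with-tools" is a hypothetical display name.
tools_batch_job = client.batches.create(
    model="models/gemini-2.5-flash",
    src=inline_requests_list,
    config={'display_name': "inlined-requests-with-tools"},
)
print(f"Created batch job: {tools_batch_job.name}")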
name } " ) REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type:application/json" \ -d '{ "batch": { "display_name": "my-batch-requests", "input_config": { "requests": { "requests": [ { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-1" } }, { "request": {"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}]}, "metadata": { "key": "request-2" } } ] } } } }' You can use any requests you would use in non-batch (or interactive) mode. For example, you could specify the temperature, system instructions or even pass in other modalities. The following example shows some example inline requests that contain a system instruction for one of the requests: inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cloud.' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Write a short poem about a cat.' }]}], 'system_instructions' : { 'parts' : [{ 'text' : 'You are a cat. Your name is Neko.' }]}} ] Similarly can also specify tools to use for a request. The following example shows a request that enables the Google Search tool : inline_requests_list = [ { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 1998?' }]}]}, { 'contents' : [{ 'parts' : [{ 'text' : 'Who won the euro 2025?' }]}], 'tools' : [{ 'google_search ' : {}}]} ] Input file For larger sets of requests, prepare a JSON Lines (JSONL) file. Each line in this file must be a JSON object containing a user-defined key and a request object, where the request is a valid GenerateContentRequest object. The user-defined key is used in the response to indicate which output is the result of which request. For example, the request with the key defined as request-1 will have its response annotated with the same key name. This file is uploaded using the File API . The maximum allowed file size for an input file is 2GB. The following is an example of a JSONL file. You can save it in a file named my-batch-requests.json : { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}], "generation_config" : { "temperature" : 0.7 }}} { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} Similarly to inline requests, you can specify other parameters like system instructions, tools or other configurations in each request JSON. You can upload this file using the File API as shown in the following example. If you are working with multimodal input, you can reference other uploaded files within your JSONL file. Python from google import genai from google.genai import types client = genai . Client () # Create a sample JSONL file with open ( "my-batch-requests.jsonl" , "w" ) as f : requests = [ { "key" : "request-1" , "request" : { "contents" : [{ "parts" : [{ "text" : "Describe the process of photosynthesis." }]}]}}, { "key" : "request-2" , "request" : { "contents" : [{ "parts" : [{ "text" : "What are the main ingredients in a Margherita pizza?" }]}]}} ] for req in requests : f . write ( json . dumps ( req ) + " \n " ) # Upload the file to the File API uploaded_file = client . files . upload ( file = 'my-batch-requests.jsonl' , config = types . UploadFileConfig ( display_name = 'my-batch-requests' , mime_type = 'jsonl' ) ) print ( f "Uploaded file: { uploaded_file . 
name } " ) REST tmp_batch_input_file = batch_input.tmp echo -e '{"contents": [{"parts": [{"text": "Describe the process of photosynthesis."}]}], "generationConfig": {"temperature": 0.7}}\n{"contents": [{"parts": [{"text": "What are the main ingredients in a Margherita pizza?"}]}]}' > batch_input.tmp MIME_TYPE = $( file -b --mime-type " ${ tmp_batch_input_file } " ) NUM_BYTES = $( wc -c < " ${ tmp_batch_input_file } " ) DISPLAY_NAME = BatchInput tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl "https://generativelanguage.googleapis.com/upload/v1beta/files \ -D " ${ tmp_header_file } " \ -H " x-goog-api-key: $GEMINI_API_KEY " \ -H " X-Goog-Upload-Protocol: resumable " \ -H " X-Goog-Upload-Command: start " \ -H " X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H " Content-Type: application/jsonl " \ -d " { 'file' : { 'display_name' : '${DISPLAY_NAME}' }} " 2> /dev/null upload_url= $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H " Content-Length: ${ NUM_BYTES } " \ -H " X-Goog-Upload-Offset: 0 " \ -H " X-Goog-Upload-Command: upload, finalize " \ --data-binary " @ ${ tmp_batch_input_file } " 2> /dev/null > file_info.json file_uri= $( jq ".file.uri" file_info.json ) The following example calls the BatchGenerateContent method with the input file uploaded using File API: Python # Assumes `uploaded_file` is the file object from the previous step file_batch_job = client . batches . create ( model = "gemini-2.5-flash" , src = uploaded_file . name , config = { 'display_name' : "file-upload-job-1" , }, ) print ( f "Created batch job: { file_batch_job . name } " ) REST BATCH_INPUT_FILE = 'files/123456' # File ID curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:batchGenerateContent \ -X POST \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" \ -d "{ 'batch': { 'display_name': 'my-batch-requests', 'input_config': { 'requests': { 'file_name': ${ BATCH_INPUT_FILE } } } } }" When you create a batch job, you will get a job name returned. Use this name for monitoring the job status as well as retrieving the results once the job completes. The following is an example output that contains a job name: Created batch job from file: batches/123456789 Monitoring job status Use the operation name obtained when creating the batch job to poll its status. The state field of the batch job will indicate its current status. A batch job can be in one of the following states: JOB_STATE_PENDING : The job has been created and is waiting to be processed by the service. JOB_STATE_SUCCEEDED : The job completed successfully. You can now retrieve the results. JOB_STATE_FAILED : The job failed. Check the error details for more information. JOB_STATE_CANCELLED : The job was cancelled by the user. You can poll the job status periodically to check for completion. Python # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" # (e.g. 'batches/your-batch-id') batch_job = client . batches . get ( name = job_name ) completed_states = set ([ 'JOB_STATE_SUCCEEDED' , 'JOB_STATE_FAILED' , 'JOB_STATE_CANCELLED' , ]) print ( f "Polling status for job: { job_name } " ) batch_job = client . batches . 
get ( name = job_name ) # Initial get while batch_job . state . name not in completed_states : print ( f "Current state: { batch_job . state . name } " ) time . sleep ( 30 ) # Wait for 30 seconds before polling again batch_job = client . batches . get ( name = job_name ) print ( f "Job finished with state: { batch_job . state . name } " ) if batch_job . state . name == 'JOB_STATE_FAILED' : print ( f "Error: { batch_job . error } " ) Retrieving results Once the job status indicates your batch job has succeeded, the results are available in the response field. Python import json # Use the name of the job you want to check # e.g., inline_batch_job.name from the previous step job_name = "YOUR_BATCH_JOB_NAME" batch_job = client . batches . get ( name = job_name ) if batch_job . state . name == 'JOB_STATE_SUCCEEDED' : # If batch job was created with a file if batch_job . dest and batch_job . dest . file_name : # Results are in a file result_file_name = batch_job . dest . file_name print ( f "Results are in file: { result_file_name } " ) print ( "Downloading result file content..." ) file_content = client . files . download ( file = result_file_name ) # Process file_content (bytes) as needed print ( file_content . decode ( 'utf-8' )) # If batch job was created with inline request elif batch_job . dest and batch_job . dest . inlined_responses : # Results are inline print ( "Results are inline:" ) for i , inline_response in enumerate ( batch_job . dest . inlined_responses ): print ( f "Response { i + 1 } :" ) if inline_response . response : # Accessing response, structure may vary. try : print ( inline_response . response . text ) except AttributeError : print ( inline_response . response ) # Fallback elif inline_response . error : print ( f "Error: { inline_response . error } " ) else : print ( "No results found (neither file nor inline)." ) else : print ( f "Job did not succeed. Final state: { batch_job . state . name } " ) if batch_job . error : print ( f "Error: { batch_job . error } " ) REST BATCH_NAME = "batches/123456" # Your batch job name curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null > batch_status.json if jq -r '.done' batch_status.json | grep -q "false" ; then echo "Batch has not finished processing" fi batch_state = $( jq -r '.metadata.state' batch_status.json ) if [[ $batch_state = "JOB_STATE_SUCCEEDED" ]] ; then if [[ $( jq '.response | has("inlinedResponses")' batch_status.json ) = "true" ]] ; then jq -r '.response.inlinedResponses' batch_status.json exit fi responses_file_name = $( jq -r '.response.responsesFile' batch_status.json ) curl https://generativelanguage.googleapis.com/download/v1beta/ $responses_file_name :download?alt = media \ -H "x-goog-api-key: $GEMINI_API_KEY " 2 > /dev/null elif [[ $batch_state = "JOB_STATE_FAILED" ]] ; then jq '.error' batch_status.json elif [[ $batch_state == "JOB_STATE_CANCELLED" ]] ; then echo "Batch was cancelled by the user" fi Cancelling a batch job You can cancel an ongoing batch job using its name. When a job is canceled, it stops processing new requests. Python # Cancel a batch job client . batches . cancel ( name = batch_job_to_cancel . 
name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Cancel the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :cancel \ -H "x-goog-api-key: $GEMINI_API_KEY " \ # Confirm that the status of the batch after cancellation is JOB_STATE_CANCELLED curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Type:application/json" 2 > /dev/null | jq -r '.metadata.state' Deleting a batch job You can delete an existing batch job using its name. When a job is deleted, it stops processing new requests and is removed from the list of batch jobs. Python # Delete a batch job client . batches . delete ( name = batch_job_to_delete . name ) REST BATCH_NAME = "batches/123456" # Your batch job name # Delete the batch curl https://generativelanguage.googleapis.com/v1beta/ $BATCH_NAME :delete \ -H "x-goog-api-key: $GEMINI_API_KEY " \ Technical details Supported models: Batch Mode supports a range of Gemini models. Refer to the Models page for the latest list of compatible models. The supported modalities for Batch Mode are the same as what's supported on the interactive (or non-batch mode) API. Pricing: Batch Mode usage is priced at 50% of the standard interactive API cost for the equivalent model. Service Level Objective (SLO): Batch jobs are designed to complete within a 24-hour turnaround time. Many jobs may complete much faster depending on their size and current system load. Caching: Context caching is enabled for batch requests. If a request in your batch results in a cache hit, the cached tokens are priced the same as for non-batch mode traffic. Best practices Use input files for large requests: For a large number of requests, always use the file input method for better manageability and to avoid hitting request size limits for the BatchGenerateContent call itself. Note that there's a 2GB file size limit per input file. Error handling: Check the batchStats for failedRequestCount after a job completes. If using file output, parse each line to check if it's a GenerateContentResponse or a status object indicating an error for that specific request (see the parsing sketch after this list). Submit jobs once: The creation of a batch job is not idempotent. If you send the same creation request twice, two separate batch jobs will be created. Break up very large batches: While the target turnaround time is 24 hours, actual processing time can vary based on system load and job size. For large jobs, consider breaking them into smaller batches if intermediate results are needed sooner.
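The following is a minimal sketch of the error-handling practice above. It assumes the results file has already been downloaded as file_content via client.files.download (as in Retrieving results), and that each output line carries the user-defined key alongside either a response or an error field; the exact field names are an assumption, so adjust them to the actual output:

import json

# Sketch: split a downloaded JSONL results file into successes and failures.
results, failures = {}, {}
for line in file_content.decode('utf-8').splitlines():
    if not line.strip():
        continue  # skip blank lines
    entry = json.loads(line)
    key = entry.get('key')  # the user-defined key from the input file
    if 'response' in entry:
        results[key] = entry['response']
    else:
        # Treat anything without a response as a status/error object.
        failures[key] = entry.get('error', entry)
print(f"{len(results)} succeeded, {len(failures)} failed")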
What's next Check out the batch mode notebook for more examples. \ No newline at end of file diff --git a/text_content/docs_billing_d3ffaced.txt b/text_content/docs_billing_d3ffaced.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a15b392363a95d5c655fe9797a1436a0ccd2b9f --- /dev/null +++ b/text_content/docs_billing_d3ffaced.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/billing +Title: Billing  |  Gemini API  |  Google AI for Developers +================================================== + +Billing This guide provides an overview of different Gemini API billing options, explains how to enable billing and monitor usage, and provides answers to frequently asked questions (FAQs) about billing. Upgrade to the Gemini API paid tier About billing Billing for the Gemini API is based on two pricing tiers: free of charge (or free ) and pay-as-you-go (or paid ). Pricing and rate limits differ between these tiers and also vary by model. You can check out the rate limits and pricing pages for more info. For a model-by-model breakdown of capabilities, see the Gemini models page . How to request an upgrade To transition from the free tier to the pay-as-you-go plan, you need to enable billing for your Google Cloud project. The button you see in Google AI Studio depends on your project's current plan. If you're on the free tier, you'll see a Set up Billing button for your project. If you're already on the paid tier and meet the criteria for a plan change, you might see an Upgrade button. To start the process, follow these steps: Go to the AI Studio API keys page . Find the project you want to move to the paid plan and click either Set up Billing or Upgrade , depending on the button displayed. The next step depends on the button you clicked: If you clicked Set up Billing: You'll be redirected to the Google Cloud console to link a billing account to your project. Follow the on-screen instructions to complete the process. If you clicked Upgrade: The system will automatically verify your project's eligibility. If your project meets all the requirements, it will be instantly upgraded to the next tier. Why use the paid tier? When you enable billing and use the paid tier, you benefit from higher rate limits , and your prompts and responses aren't used to improve Google products. For more information on data use for paid services, see the terms of service . Cloud Billing The Gemini API uses Cloud Billing for billing services. To use the paid tier, you must set up Cloud Billing on your cloud project. After you've enabled Cloud Billing, you can use Cloud Billing tools to track spending, understand costs, make payments, and access Cloud Billing support. Enable billing You can enable Cloud Billing starting from Google AI Studio: Open Google AI Studio . In the bottom of the left sidebar, select Settings > Plan information . Click Set up Billing for your chosen project to enable Cloud Billing. Monitor usage After you enable Cloud Billing, you can monitor your usage of the Gemini API in the Google Cloud console . The service name for the API is generativelanguage.googleapis.com , and in the console the Gemini API is also referred to as the Generative Language API . To learn more, see the Google Cloud documentation on monitoring API usage . Frequently asked questions This section provides answers to frequently asked questions. What am I billed for? Gemini API pricing is based on the following: Input token count Output token count Cached token count Cached token storage duration For pricing information, see the pricing page . Where can I view my quota? You can view your quota and system limits in the Google Cloud console . How do I request more quota? To request more quota, follow the instructions at How to request an upgrade . Can I use the Gemini API for free in EEA (including EU), the UK, and CH? Yes, we make the free tier and paid tier available in many regions . If I set up billing with the Gemini API, will I be charged for my Google AI Studio usage? No, Google AI Studio usage remains free of charge regardless of whether you set up billing, across all supported regions. Can I use 1M tokens in the free tier? The free tier for Gemini API differs based on the model selected. For now, you can try the 1M token context window in the following ways: In Google AI Studio With pay-as-you-go plans With free-of-charge plans for select models See the latest free-of-charge rate limits per model on the rate limits page . How can I calculate the number of tokens I'm using? Use the GenerativeModel.count_tokens method to count the number of tokens. Refer to the Tokens guide to learn more about tokens.
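For example, with the google-genai SDK used elsewhere in these docs, the count looks like the following minimal sketch (newer SDK versions expose the method as client.models.count_tokens; assumes an API key is configured in the environment):

from google import genai

client = genai.Client()
# Count the tokens in a prompt before sending it, to anticipate billing.
response = client.models.count_tokens(
    model="gemini-2.5-flash",
    contents="How many tokens is this sentence?",
)
print(response.total_tokens)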
Can I use my Google Cloud credits with the Gemini API? Yes, Google Cloud credits can be used towards Gemini API usage. How is billing handled? Billing for the Gemini API is handled by the Cloud Billing system. Am I charged for failed requests? If your request fails with a 400 or 500 error, you won't be charged for the tokens used. However, the request will still count against your quota. Is there a charge for fine-tuning the models? Model tuning is free, but inference on tuned models is charged at the same rate as the base models. Is GetTokens billed? Requests to the GetTokens API are not billed, and they don't count against inference quota. How is my Google AI Studio data handled if I have a paid API account? Refer to the terms for details on how data is handled when Cloud billing is enabled (see "How Google Uses Your Data" under "Paid Services"). Note that your Google AI Studio prompts are treated under the same "Paid Services" terms so long as at least 1 API project has billing enabled, which you can validate on the Gemini API Key page if you see any projects marked as "Paid" under "Plan". Where can I get help with billing? To get help with billing, see Get Cloud Billing support . 
\ No newline at end of file diff --git a/text_content/docs_document-processing_793c51fa.txt b/text_content/docs_document-processing_793c51fa.txt new file mode 100644 index 0000000000000000000000000000000000000000..379de29009c49d32101060ebeaa0edbe2a64caa9 --- /dev/null +++ b/text_content/docs_document-processing_793c51fa.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/document-processing#main-content +Title: Document understanding  |  Gemini API  |  Google AI for Developers +================================================== + +Document understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Document understanding Gemini models can process documents in PDF format, using native vision to understand entire document contexts. This goes beyond simple text extraction, allowing Gemini to: Analyze and interpret content, including text, images, diagrams, charts, and tables, even in long documents up to 1000 pages. Extract information into structured output formats. Summarize and answer questions based on both the visual and textual elements in a document. Transcribe document content (e.g. to HTML), preserving layouts and formatting, for use in downstream applications. Passing inline PDF data You can pass inline PDF data in the request to generateContent . For PDF payloads under 20MB, you can choose between uploading base64 encoded documents or directly uploading locally stored files. The following example shows you how to fetch a PDF from a URL and convert it to bytes for processing: Python from google import genai from google.genai import types import httpx client = genai . Client () doc_url = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" # Retrieve and encode the PDF byte doc_data = httpx . get ( doc_url ) . content prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = doc_data , mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfResp = await fetch ( 'https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf' ) . then (( response ) = > response . arrayBuffer ()); const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( pdfResp ). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfResp , _ := http . Get ( "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" ) var pdfBytes [] byte if pdfResp != nil && pdfResp . Body != nil { pdfBytes , _ = io . ReadAll ( pdfResp . Body ) pdfResp . Body . 
Close () } parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" PROMPT = "Summarize this document" DISPLAY_NAME = "base64_pdf" # Download the PDF wget -O " ${ DISPLAY_NAME } .pdf" " ${ DOC_URL } " # Check for FreeBSD base64 and set flags accordingly if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi # Base64 encode the PDF ENCODED_PDF = $( base64 $B64FLAGS " ${ DISPLAY_NAME } .pdf" ) # Generate content using the base64 encoded PDF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"inline_data": {"mime_type": "application/pdf", "data": "' " $ENCODED_PDF " '"}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" You can also read a PDF from a local file for processing: Python from google import genai from google.genai import types import pathlib client = genai . Client () # Retrieve and encode the PDF byte filepath = pathlib . Path ( 'file.pdf' ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = filepath . read_bytes (), mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from 'fs' ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( fs . readFileSync ( "content/343019_3_art_0_py4t4l_convrt.pdf" )). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfBytes , _ := os . ReadFile ( "path/to/your/file.pdf" ) parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Uploading PDFs using the File API You can use the File API to upload larger documents. Always use the File API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20MB. Note: The File API lets you store up to 50MB of PDF files. Files are stored for 48 hours. 
You can access them in that period with your API key, but you can't download them from the API. The File API is available at no cost in all regions where the Gemini API is available. Call media.upload to upload a file using the File API. The following code uploads a document file and then uses the file in a call to models.generateContent . Large PDFs from URLs Use the File API to simplify uploading and processing large PDF files from URLs: Python from google import genai from google.genai import types import io import httpx client = genai . Client () long_context_pdf_path = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" # Retrieve and upload the PDF using the File API doc_io = io . BytesIO ( httpx . get ( long_context_pdf_path ) . content ) sample_doc = client . files . upload ( # You can pass a path or a file-like object here file = doc_io , config = dict ( mime_type = 'application/pdf' ) ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_doc , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfBuffer = await fetch ( "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfURL := "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" localPdfPath := "A17_FlightPlan_downloaded.pdf" respHttp , _ := http . Get ( pdfURL ) defer respHttp . Body . Close () outFile , _ := os . Create ( localPdfPath ) defer outFile . Close () _ , _ = io . Copy ( outFile , respHttp . Body ) uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . 
RoleUser ), // Specify role } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST PDF_PATH = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" DISPLAY_NAME = "A17_FlightPlan" PROMPT = "Summarize this document" # Download the PDF from the provided URL wget -O " ${ DISPLAY_NAME } .pdf" " ${ PDF_PATH } " MIME_TYPE = $( file -b --mime-type " ${ DISPLAY_NAME } .pdf" ) NUM_BYTES = $( wc -c < " ${ DISPLAY_NAME } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ DISPLAY_NAME } .pdf" 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo "file_uri: ${ file_uri } " # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "' $PROMPT '"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" Large PDFs stored locally Python from google import genai from google.genai import types import pathlib import httpx client = genai . Client () # Retrieve and encode the PDF byte file_path = pathlib . Path ( 'large_file.pdf' ) # Upload the PDF using the File API sample_file = client . files . upload ( file = file_path , ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_file , "Summarize this document" ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const file = await ai . files . upload ({ file : 'path-to-localfile.pdf' config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . 
mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) localPdfPath := "/path/to/file.pdf" uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Give me a summary of this pdf file." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST NUM_BYTES = $( wc -c < " ${ PDF_PATH } " ) DISPLAY_NAME = TEXT tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GEMINI_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: application/pdf" \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ PDF_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Can you add a few more lines to this poem?"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json You can verify the API successfully stored the uploaded file and get its metadata by calling files.get . Only the name (and by extension, the uri ) are unique. Python from google import genai import pathlib client = genai . Client () fpath = pathlib . Path ( 'example.txt' ) fpath . write_text ( 'hello' ) file = client . files . upload ( file = 'example.txt' ) file_info = client . files . get ( name = file . name ) print ( file_info . 
model_dump_json ( indent = 4 )) REST name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri Passing multiple PDFs The Gemini API is capable of processing multiple PDF documents (up to 1000 pages) in a single request, as long as the combined size of the documents and the text prompt stays within the model's context window. Python from google import genai import io import httpx client = genai . Client () doc_url_1 = "https://arxiv.org/pdf/2312.11805" doc_url_2 = "https://arxiv.org/pdf/2403.05530" # Retrieve and upload both PDFs using the File API doc_data_1 = io . BytesIO ( httpx . get ( doc_url_1 ) . content ) doc_data_2 = io . BytesIO ( httpx . get ( doc_url_2 ) . content ) sample_pdf_1 = client . files . upload ( file = doc_data_1 , config = dict ( mime_type = 'application/pdf' ) ) sample_pdf_2 = client . files . upload ( file = doc_data_2 , config = dict ( mime_type = 'application/pdf' ) ) prompt = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_pdf_1 , sample_pdf_2 , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function uploadRemotePDF ( url , displayName ) { const pdfBuffer = await fetch ( url ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : displayName , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } return file ; } async function main () { const content = [ 'What is the difference between each of the main benchmarks between these two papers? Output these in a table.' , ]; let file1 = await uploadRemotePDF ( "https://arxiv.org/pdf/2312.11805" , "PDF 1" ) if ( file1 . uri && file1 . mimeType ) { const fileContent = createPartFromUri ( file1 . uri , file1 . mimeType ); content . push ( fileContent ); } let file2 = await uploadRemotePDF ( "https://arxiv.org/pdf/2403.05530" , "PDF 2" ) if ( file2 . uri && file2 . mimeType ) { const fileContent = createPartFromUri ( file2 . uri , file2 . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . 
BackendGeminiAPI , }) docUrl1 := "https://arxiv.org/pdf/2312.11805" docUrl2 := "https://arxiv.org/pdf/2403.05530" localPath1 := "doc1_downloaded.pdf" localPath2 := "doc2_downloaded.pdf" respHttp1 , _ := http . Get ( docUrl1 ) defer respHttp1 . Body . Close () outFile1 , _ := os . Create ( localPath1 ) _ , _ = io . Copy ( outFile1 , respHttp1 . Body ) outFile1 . Close () respHttp2 , _ := http . Get ( docUrl2 ) defer respHttp2 . Body . Close () outFile2 , _ := os . Create ( localPath2 ) _ , _ = io . Copy ( outFile2 , respHttp2 . Body ) outFile2 . Close () uploadConfig1 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile1 , _ := client . Files . UploadFromPath ( ctx , localPath1 , uploadConfig1 ) uploadConfig2 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile2 , _ := client . Files . UploadFromPath ( ctx , localPath2 , uploadConfig2 ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile1 . URI , uploadedFile1 . MIMEType ), genai . NewPartFromURI ( uploadedFile2 . URI , uploadedFile2 . MIMEType ), genai . NewPartFromText ( "What is the difference between each of the " + "main benchmarks between these two papers? " + "Output these in a table." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } modelName := "gemini-2.5-flash" result , _ := client . Models . GenerateContent ( ctx , modelName , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL_1 = "https://arxiv.org/pdf/2312.11805" DOC_URL_2 = "https://arxiv.org/pdf/2403.05530" DISPLAY_NAME_1 = "Gemini_paper" DISPLAY_NAME_2 = "Gemini_1.5_paper" PROMPT = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." # Function to download and upload a PDF upload_pdf () { local doc_url = " $1 " local display_name = " $2 " # Download the PDF wget -O " ${ display_name } .pdf" " ${ doc_url } " local MIME_TYPE = $( file -b --mime-type " ${ display_name } .pdf" ) local NUM_BYTES = $( wc -c < " ${ display_name } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " local tmp_header_file = upload-header.tmp # Initial resumable request curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ display_name } '}}" 2 > /dev/null local upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the PDF curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ display_name } .pdf" 2 > /dev/null > "file_info_ ${ display_name } .json" local file_uri = $( jq ".file.uri" "file_info_ ${ display_name } .json" ) echo "file_uri for ${ display_name } : ${ file_uri } " # Clean up the downloaded PDF rm " ${ display_name } .pdf" echo " ${ file_uri } " } # Upload the first PDF file_uri_1 = $( upload_pdf " ${ DOC_URL_1 } " " ${ DISPLAY_NAME_1 } " ) # Upload the second PDF file_uri_2 = $( upload_pdf " ${ DOC_URL_2 } " " ${ DISPLAY_NAME_2 } " ) # Now generate content using both files curl 
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_1 '}}, {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_2 '}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Technical details Gemini supports a maximum of 1,000 document pages. Each document page is equivalent to 258 tokens. While there are no specific limits to the number of pixels in a document besides the model's context window , larger pages are scaled down to a maximum resolution of 3072x3072 while preserving their original aspect ratio, while smaller pages are scaled up to 768x768 pixels. There is no cost reduction for pages at lower sizes, other than bandwidth, or performance improvement for pages at higher resolution. Document types Technically, you can pass other MIME types for document understanding, like TXT, Markdown, HTML, XML, etc. However, document vision only meaningfully understands PDFs . Other types will be extracted as pure text, and the model won't be able to interpret what we see in the rendering of those files. Any file-type specifics like charts, diagrams, HTML tags, Markdown formatting, etc., will be lost. Best practices For best results: Rotate pages to the correct orientation before uploading. Avoid blurry pages. If using a single page, place the text prompt after the page. What's next To learn more, see the following resources: File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/text_content/docs_document-processing_9e479485.txt b/text_content/docs_document-processing_9e479485.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e4e0b834ad0e22e261ef511247b7d8f6ac7d3c6 --- /dev/null +++ b/text_content/docs_document-processing_9e479485.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/document-processing +Title: Document understanding  |  Gemini API  |  Google AI for Developers +================================================== + +Document understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Document understanding Gemini models can process documents in PDF format, using native vision to understand entire document contexts. This goes beyond simple text extraction, allowing Gemini to: Analyze and interpret content, including text, images, diagrams, charts, and tables, even in long documents up to 1000 pages. 
Extract information into structured output formats. Summarize and answer questions based on both the visual and textual elements in a document. Transcribe document content (e.g. to HTML), preserving layouts and formatting, for use in downstream applications. Passing inline PDF data You can pass inline PDF data in the request to generateContent . For PDF payloads under 20MB, you can choose between uploading base64 encoded documents or directly uploading locally stored files. The following example shows you how to fetch a PDF from a URL and convert it to bytes for processing: Python from google import genai from google.genai import types import httpx client = genai . Client () doc_url = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" # Retrieve and encode the PDF byte doc_data = httpx . get ( doc_url ) . content prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = doc_data , mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfResp = await fetch ( 'https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf' ) . then (( response ) = > response . arrayBuffer ()); const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( pdfResp ). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfResp , _ := http . Get ( "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" ) var pdfBytes [] byte if pdfResp != nil && pdfResp . Body != nil { pdfBytes , _ = io . ReadAll ( pdfResp . Body ) pdfResp . Body . Close () } parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST DOC_URL = "https://discovery.ucl.ac.uk/id/eprint/10089234/1/343019_3_art_0_py4t4l_convrt.pdf" PROMPT = "Summarize this document" DISPLAY_NAME = "base64_pdf" # Download the PDF wget -O " ${ DISPLAY_NAME } .pdf" " ${ DOC_URL } " # Check for FreeBSD base64 and set flags accordingly if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi # Base64 encode the PDF ENCODED_PDF = $( base64 $B64FLAGS " ${ DISPLAY_NAME } .pdf" ) # Generate content using the base64 encoded PDF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"inline_data": {"mime_type": "application/pdf", "data": "' " $ENCODED_PDF " '"}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" You can also read a PDF from a local file for processing: Python from google import genai from google.genai import types import pathlib client = genai . Client () # Retrieve and encode the PDF byte filepath = pathlib . Path ( 'file.pdf' ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ types . Part . from_bytes ( data = filepath . read_bytes (), mime_type = 'application/pdf' , ), prompt ]) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from 'fs' ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const contents = [ { text : "Summarize this document" }, { inlineData : { mimeType : 'application/pdf' , data : Buffer . from ( fs . readFileSync ( "content/343019_3_art_0_py4t4l_convrt.pdf" )). toString ( "base64" ) } } ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfBytes , _ := os . ReadFile ( "path/to/your/file.pdf" ) parts := [] * genai . Part { & genai . Part { InlineData : & genai . Blob { MIMEType : "application/pdf" , Data : pdfBytes , }, }, genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } Uploading PDFs using the File API You can use the File API to upload larger documents. Always use the File API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20MB. Note: The File API lets you store up to 50MB of PDF files. Files are stored for 48 hours. You can access them in that period with your API key, but you can't download them from the API. The File API is available at no cost in all regions where the Gemini API is available. Call media.upload to upload a file using the File API. The following code uploads a document file and then uses the file in a call to models.generateContent . 
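Uploaded files can briefly report a PROCESSING state before they become usable. The JavaScript samples below poll for this; the following is a minimal Python sketch of the same idea, assuming the file states (PROCESSING, ACTIVE, FAILED) shown in the JavaScript examples, and using a placeholder file path:

import time
from google import genai

client = genai.Client()

# Placeholder path; substitute a real local PDF.
myfile = client.files.upload(file="path/to/large_file.pdf")

# Poll files.get until processing finishes. The exact enum type of
# file.state may vary by SDK version, so we compare by name.
while myfile.state.name == "PROCESSING":
    time.sleep(5)
    myfile = client.files.get(name=myfile.name)

if myfile.state.name == "FAILED":
    raise RuntimeError("File processing failed.")

# The file is now safe to reference in a generate_content call.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[myfile, "Summarize this document"],
)
print(response.text)

Small local files are usually ready almost immediately, so the polling loop matters mainly for large uploads like the ones in the following sections.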
Large PDFs from URLs Use the File API to simplify uploading and processing large PDF files from URLs: Python from google import genai from google.genai import types import io import httpx client = genai . Client () long_context_pdf_path = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" # Retrieve and upload the PDF using the File API doc_io = io . BytesIO ( httpx . get ( long_context_pdf_path ) . content ) sample_doc = client . files . upload ( # You can pass a path or a file-like object here file = doc_io , config = dict ( mime_type = 'application/pdf' ) ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_doc , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const pdfBuffer = await fetch ( "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) pdfURL := "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" localPdfPath := "A17_FlightPlan_downloaded.pdf" respHttp , _ := http . Get ( pdfURL ) defer respHttp . Body . Close () outFile , _ := os . Create ( localPdfPath ) defer outFile . Close () _ , _ = io . Copy ( outFile , respHttp . Body ) uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Summarize this document" ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), // Specify role } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . 
Text ()) } REST BASE_URL = "https://generativelanguage.googleapis.com" PDF_PATH = "https://www.nasa.gov/wp-content/uploads/static/history/alsj/a17/A17_FlightPlan.pdf" DISPLAY_NAME = "A17_FlightPlan" PROMPT = "Summarize this document" # Download the PDF from the provided URL wget -O " ${ DISPLAY_NAME } .pdf" " ${ PDF_PATH } " MIME_TYPE = $( file -b --mime-type " ${ DISPLAY_NAME } .pdf" ) NUM_BYTES = $( wc -c < " ${ DISPLAY_NAME } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload URL is in the response headers; dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ DISPLAY_NAME } .pdf" 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo "file_uri: ${ file_uri } " # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "' $PROMPT '"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json # Clean up the downloaded PDF rm " ${ DISPLAY_NAME } .pdf" Large PDFs stored locally Python from google import genai import pathlib client = genai . Client () # Read the local PDF and upload it using the File API file_path = pathlib . Path ( 'large_file.pdf' ) sample_file = client . files . upload ( file = file_path , ) prompt = "Summarize this document" response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_file , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function main () { const file = await ai . files . upload ({ file : 'path-to-localfile.pdf' , config : { displayName : 'A17_FlightPlan.pdf' , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) => { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } // Add the file to the contents. const content = [ 'Summarize this document' , ]; if ( file . uri && file . mimeType ) { const fileContent = createPartFromUri ( file . uri , file . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . BackendGeminiAPI , }) localPdfPath := "/path/to/file.pdf" uploadConfig := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile , _ := client . Files . UploadFromPath ( ctx , localPdfPath , uploadConfig ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), genai . NewPartFromText ( "Give me a summary of this PDF file." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST BASE_URL = "https://generativelanguage.googleapis.com" PDF_PATH = "path/to/your/file.pdf" NUM_BYTES = $( wc -c < " ${ PDF_PATH } " ) DISPLAY_NAME = PDF tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload URL is in the response headers; dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GEMINI_API_KEY } " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: application/pdf" \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ PDF_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Summarize this document"}, {"file_data":{"mime_type": "application/pdf", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json You can verify the API successfully stored the uploaded file and get its metadata by calling files.get . Only the name (and by extension, the uri ) is unique. Python from google import genai import pathlib client = genai . Client () fpath = pathlib . Path ( 'example.txt' ) fpath . write_text ( 'hello' ) file = client . files . upload ( file = 'example.txt' ) file_info = client . files . get ( name = file . name ) print ( file_info .
model_dump_json ( indent = 4 )) REST name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri Passing multiple PDFs The Gemini API is capable of processing multiple PDF documents (up to 1000 pages) in a single request, as long as the combined size of the documents and the text prompt stays within the model's context window. Python from google import genai import io import httpx client = genai . Client () doc_url_1 = "https://arxiv.org/pdf/2312.11805" doc_url_2 = "https://arxiv.org/pdf/2403.05530" # Retrieve and upload both PDFs using the File API doc_data_1 = io . BytesIO ( httpx . get ( doc_url_1 ) . content ) doc_data_2 = io . BytesIO ( httpx . get ( doc_url_2 ) . content ) sample_pdf_1 = client . files . upload ( file = doc_data_1 , config = dict ( mime_type = 'application/pdf' ) ) sample_pdf_2 = client . files . upload ( file = doc_data_2 , config = dict ( mime_type = 'application/pdf' ) ) prompt = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ sample_pdf_1 , sample_pdf_2 , prompt ]) print ( response . text ) JavaScript import { createPartFromUri , GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" }); async function uploadRemotePDF ( url , displayName ) { const pdfBuffer = await fetch ( url ) . then (( response ) = > response . arrayBuffer ()); const fileBlob = new Blob ([ pdfBuffer ], { type : 'application/pdf' }); const file = await ai . files . upload ({ file : fileBlob , config : { displayName : displayName , }, }); // Wait for the file to be processed. let getFile = await ai . files . get ({ name : file . name }); while ( getFile . state === 'PROCESSING' ) { getFile = await ai . files . get ({ name : file . name }); console . log ( `current file status: ${ getFile . state } ` ); console . log ( 'File is still processing, retrying in 5 seconds' ); await new Promise (( resolve ) = > { setTimeout ( resolve , 5000 ); }); } if ( file . state === 'FAILED' ) { throw new Error ( 'File processing failed.' ); } return file ; } async function main () { const content = [ 'What is the difference between each of the main benchmarks between these two papers? Output these in a table.' , ]; let file1 = await uploadRemotePDF ( "https://arxiv.org/pdf/2312.11805" , "PDF 1" ) if ( file1 . uri && file1 . mimeType ) { const fileContent = createPartFromUri ( file1 . uri , file1 . mimeType ); content . push ( fileContent ); } let file2 = await uploadRemotePDF ( "https://arxiv.org/pdf/2403.05530" , "PDF 2" ) if ( file2 . uri && file2 . mimeType ) { const fileContent = createPartFromUri ( file2 . uri , file2 . mimeType ); content . push ( fileContent ); } const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : content , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "io" "net/http" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , _ := genai . NewClient ( ctx , & genai . ClientConfig { APIKey : os . Getenv ( "GEMINI_API_KEY" ), Backend : genai . 
BackendGeminiAPI , }) docUrl1 := "https://arxiv.org/pdf/2312.11805" docUrl2 := "https://arxiv.org/pdf/2403.05530" localPath1 := "doc1_downloaded.pdf" localPath2 := "doc2_downloaded.pdf" respHttp1 , _ := http . Get ( docUrl1 ) defer respHttp1 . Body . Close () outFile1 , _ := os . Create ( localPath1 ) _ , _ = io . Copy ( outFile1 , respHttp1 . Body ) outFile1 . Close () respHttp2 , _ := http . Get ( docUrl2 ) defer respHttp2 . Body . Close () outFile2 , _ := os . Create ( localPath2 ) _ , _ = io . Copy ( outFile2 , respHttp2 . Body ) outFile2 . Close () uploadConfig1 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile1 , _ := client . Files . UploadFromPath ( ctx , localPath1 , uploadConfig1 ) uploadConfig2 := & genai . UploadFileConfig { MIMEType : "application/pdf" } uploadedFile2 , _ := client . Files . UploadFromPath ( ctx , localPath2 , uploadConfig2 ) promptParts := [] * genai . Part { genai . NewPartFromURI ( uploadedFile1 . URI , uploadedFile1 . MIMEType ), genai . NewPartFromURI ( uploadedFile2 . URI , uploadedFile2 . MIMEType ), genai . NewPartFromText ( "What is the difference between each of the " + "main benchmarks between these two papers? " + "Output these in a table." ), } contents := [] * genai . Content { genai . NewContentFromParts ( promptParts , genai . RoleUser ), } modelName := "gemini-2.5-flash" result , _ := client . Models . GenerateContent ( ctx , modelName , contents , nil , ) fmt . Println ( result . Text ()) } REST DOC_URL_1 = "https://arxiv.org/pdf/2312.11805" DOC_URL_2 = "https://arxiv.org/pdf/2403.05530" DISPLAY_NAME_1 = "Gemini_paper" DISPLAY_NAME_2 = "Gemini_1.5_paper" PROMPT = "What is the difference between each of the main benchmarks between these two papers? Output these in a table." # Function to download and upload a PDF upload_pdf () { local doc_url = " $1 " local display_name = " $2 " # Download the PDF wget -O " ${ display_name } .pdf" " ${ doc_url } " local MIME_TYPE = $( file -b --mime-type " ${ display_name } .pdf" ) local NUM_BYTES = $( wc -c < " ${ display_name } .pdf" ) echo "MIME_TYPE: ${ MIME_TYPE } " echo "NUM_BYTES: ${ NUM_BYTES } " local tmp_header_file = upload-header.tmp # Initial resumable request curl " ${ BASE_URL } /upload/v1beta/files?key= ${ GOOGLE_API_KEY } " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ display_name } '}}" 2 > /dev/null local upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the PDF curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ display_name } .pdf" 2 > /dev/null > "file_info_ ${ display_name } .json" local file_uri = $( jq ".file.uri" "file_info_ ${ display_name } .json" ) echo "file_uri for ${ display_name } : ${ file_uri } " # Clean up the downloaded PDF rm " ${ display_name } .pdf" echo " ${ file_uri } " } # Upload the first PDF file_uri_1 = $( upload_pdf " ${ DOC_URL_1 } " " ${ DISPLAY_NAME_1 } " ) # Upload the second PDF file_uri_2 = $( upload_pdf " ${ DOC_URL_2 } " " ${ DISPLAY_NAME_2 } " ) # Now generate content using both files curl 
"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key= $GOOGLE_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_1 '}}, {"file_data": {"mime_type": "application/pdf", "file_uri": ' $file_uri_2 '}}, {"text": "' $PROMPT '"} ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Technical details Gemini supports a maximum of 1,000 document pages. Each document page is equivalent to 258 tokens. While there are no specific limits to the number of pixels in a document besides the model's context window , larger pages are scaled down to a maximum resolution of 3072x3072 while preserving their original aspect ratio, while smaller pages are scaled up to 768x768 pixels. There is no cost reduction for pages at lower sizes, other than bandwidth, or performance improvement for pages at higher resolution. Document types Technically, you can pass other MIME types for document understanding, like TXT, Markdown, HTML, XML, etc. However, document vision only meaningfully understands PDFs . Other types will be extracted as pure text, and the model won't be able to interpret what we see in the rendering of those files. Any file-type specifics like charts, diagrams, HTML tags, Markdown formatting, etc., will be lost. Best practices For best results: Rotate pages to the correct orientation before uploading. Avoid blurry pages. If using a single page, place the text prompt after the page. What's next To learn more, see the following resources: File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/text_content/docs_embeddings_b5d471e0.txt b/text_content/docs_embeddings_b5d471e0.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b31d29d68e2ab3d2d5678cc29def00478f5c713 --- /dev/null +++ b/text_content/docs_embeddings_b5d471e0.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/embeddings#use-cases +Title: Embeddings  |  Gemini API  |  Google AI for Developers +================================================== + +Embeddings | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Embeddings Note: Introducing our first Gemini embedding model, available now to developers as gemini-embedding-exp-03-07 in the API. The Gemini API supports several embedding models that generate embeddings for words, phrases, code, and sentences. The resulting embeddings can then be used for tasks such as semantic search, text classification, and clustering, among many others. 
What are embeddings? Embeddings are numerical representations of text (or other media formats) that capture relationships between inputs. Text embeddings work by converting text into arrays of floating point numbers, called vectors . These vectors are designed to capture the meaning of the text. The length of the embedding array is called the vector's dimensionality . A passage of text might be represented by a vector containing hundreds of dimensions. Embeddings capture semantic meaning and context, which results in text with similar meanings having "closer" embeddings. For example, the sentences "I took my dog to the vet" and "I took my cat to the vet" would have embeddings that are close to each other in the vector space. You can use embeddings to compare different texts and understand how they relate. For example, if the embeddings of the texts "cat" and "dog" are close together, you can infer that these words are similar in meaning, context, or both. This enables a variety of common AI use cases .
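To make "closer" concrete, you can compare two embeddings with cosine similarity. The following is a minimal sketch using the embedContent call introduced below; it assumes embeddings[0].values exposes the vector of floats, and the similarity helper is plain Python written for this example:

import math
from google import genai

client = genai.Client()

def cosine_similarity(a, b):
    # Plain-Python cosine similarity, written for this example.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)

# Embed two sentences; each result carries a list of float values.
emb1 = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="I took my dog to the vet",
).embeddings[0].values
emb2 = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="I took my cat to the vet",
).embeddings[0].values

# Semantically similar sentences should score close to 1.0.
print(f"similarity: {cosine_similarity(emb1, emb2):.3f}")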
Before you begin Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. Generate embeddings Use the embedContent method to generate text embeddings: Python from google import genai client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , }); console . log ( response . embeddings ); } main (); Go package main import ( "context" "encoding/json" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := [] * genai . Content { genai . NewContentFromText ( "What is the meaning of life?" , genai . RoleUser ), } result , err := client . Models . EmbedContent ( ctx , "gemini-embedding-exp-03-07" , contents , nil , ) if err != nil { log . Fatal ( err ) } embeddings , err := json . MarshalIndent ( result . Embeddings , "" , " " ) if err != nil { log . Fatal ( err ) } fmt . Println ( string ( embeddings )) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]} }' You can also generate embeddings for multiple chunks at once by passing them in as a list of strings. Task types When building Retrieval Augmented Generation (RAG) systems, a common design is to use text embeddings to perform a similarity search. In some cases this can lead to degraded quality, because questions and their answers are not semantically similar. For example, a question like "Why is the sky blue?" and its answer "The scattering of sunlight causes the blue color" have distinctly different meanings as statements, which means that a RAG system won't automatically recognize their relation. Task types enable you to generate optimized embeddings for specific tasks, saving you time and cost and improving performance. Python from google import genai from google.genai import types client = genai . Client () result = client . models . embed_content ( model = "gemini-embedding-exp-03-07" , contents = "What is the meaning of life?" , config = types . EmbedContentConfig ( task_type = "SEMANTIC_SIMILARITY" ) ) print ( result . embeddings ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . embedContent ({ model : 'gemini-embedding-exp-03-07' , contents : 'What is the meaning of life?' , config : { taskType : "SEMANTIC_SIMILARITY" , } }); console . log ( response . embeddings ); } main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-embedding-exp-03-07:embedContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{"model": "models/gemini-embedding-exp-03-07", "content": { "parts":[{ "text": "What is the meaning of life?"}]}, "taskType": "SEMANTIC_SIMILARITY" }' Supported task types
Task type: Description
SEMANTIC_SIMILARITY : Used to generate embeddings that are optimized to assess text similarity.
CLASSIFICATION : Used to generate embeddings that are optimized to classify texts according to preset labels.
CLUSTERING : Used to generate embeddings that are optimized to cluster texts based on their similarities.
RETRIEVAL_DOCUMENT , RETRIEVAL_QUERY , QUESTION_ANSWERING , and FACT_VERIFICATION : Used to generate embeddings that are optimized for document search or information retrieval.
CODE_RETRIEVAL_QUERY : Used to retrieve a code block based on a natural language query, such as "sort an array" or "reverse a linked list". Embeddings of the code blocks are computed using RETRIEVAL_DOCUMENT .
Use cases Text embeddings are used in a variety of common AI use cases, such as:
Information retrieval: You can use embeddings to retrieve semantically similar text given a piece of input text. Document search tutorial
Clustering: Comparing groups of embeddings can help identify hidden trends. Embedding clustering tutorial
Vector database: As you take different embedding use cases to production, it is common to store embeddings in a vector database. Vector database tutorial
Classification: You can train a model using embeddings to classify documents into categories. Classification tutorial
Embedding models The Gemini API offers three models that generate text embeddings: gemini-embedding-exp-03-07 , text-embedding-004 , and embedding-001 . What's next Check out the embeddings quickstart notebook . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
\ No newline at end of file diff --git a/text_content/docs_function-calling_8995ed36.txt b/text_content/docs_function-calling_8995ed36.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f50f263d7699888b24d8639242ef13d0da1281e --- /dev/null +++ b/text_content/docs_function-calling_8995ed36.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#step-4 +Title: Function calling with the Gemini API  |  Google AI for Developers +================================================== +Function calling with the Gemini API | Google AI for Developers Function calling with the Gemini API Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has three primary use cases:
Augment knowledge: Access information from external sources like databases, APIs, and knowledge bases.
Extend capabilities: Use external tools to perform computations and extend the limitations of the model, such as using a calculator or creating charts.
Take actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices.
How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process:
1. Define a function declaration: Define the function declaration in your application code. Function declarations describe the function's name, parameters, and purpose to the model.
2. Call the LLM with function declarations: Send the user prompt along with the function declaration(s) to the model. The model analyzes the request and determines whether a function call would be helpful. If so, it responds with a structured JSON object.
3. Execute the function code (your responsibility): The model does not execute the function itself. Your application must process the response and check for a function call. If one is present, extract the name and args of the function and execute the corresponding function in your application; if not, the model has provided a direct text response to the prompt.
4. Create a user-friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call.
This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request.
This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) -> dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call )
JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI-compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract the tool call details; note that the call may not always be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . stringify ( result ) } ` ); } Step 4: Create user friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the action requested by the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters:
name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase).
description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters.").
parameters (object): Defines the input parameters the function expects.
type (string): Specifies the overall data type, such as object .
properties (object): Lists individual parameters, each with:
type (string): The data type of the parameter, such as string , integer , boolean, array .
description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'.").
enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]).
required (array): An array of strings listing the parameter names that are mandatory for the function to operate.
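To make these fields concrete, here is a minimal, hypothetical declaration; get_weather_forecast and its parameters are invented for this sketch rather than taken from the API reference:

# A minimal, hypothetical declaration illustrating the schema fields above.
# The function name and parameters are invented for illustration only.
get_weather_forecast_declaration = {
    "name": "get_weather_forecast",
    "description": "Gets the weather forecast for a given city.",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state, e.g., 'San Francisco, CA'.",
            },
            "unit": {
                "type": "string",
                # A fixed set of allowed values improves accuracy.
                "enum": ["celsius", "fahrenheit"],
                "description": "Temperature unit for the forecast.",
            },
        },
        # Only 'location' is mandatory; 'unit' stays optional.
        "required": ["location"],
    },
}

A dictionary like this can be passed through types.Tool(function_declarations=[...]) in the same way as the set_light_values example in Step 2.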
Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls, since they typically span multiple turns. To preserve this context you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures:
Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning.
Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance.
Receiving signatures from the server Signatures are returned in the part after the model's thinking phase, which is typically a text or function call part. Here are some examples of what thought signatures look like, returned in each type of part, in response to the request "What's the weather in Lake Tahoe?"
using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . thought_signature ) Returning signatures back to the server In order to return signatures back: You should return signatures along with their containing parts back to the server You shouldn't merge a part with a signature with another part which also contains a signature. The signature string is not concatenable You shouldn't merge one part with a signature with another part without a signature. This breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . 
Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" } ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases or checking inventory levels across various warehouses or performing multiple actions such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." 
, "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . 
map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested (a hedged sketch of this round trip appears below). The Python SDK supports automatic function calling , which automatically converts Python functions to declarations and handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. """ return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to turn this place into a party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task.
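As a brief detour before the compositional example's code: the parallel section above says to return responses in the same order the calls were requested, but doesn't show that round trip. The following is a minimal, hypothetical sketch of it for the disco example, assuming house_tools and the *_impl functions defined above are in scope (the impls mapping and variable names are illustrative, not part of the official example). Python

from google import genai
from google.genai import types

client = genai.Client()

# Assumed in scope from the examples above: house_tools, power_disco_ball_impl,
# start_music_impl, and dim_lights_impl.
impls = {
    "power_disco_ball": power_disco_ball_impl,
    "start_music": start_music_impl,
    "dim_lights": dim_lights_impl,
}
config = types.GenerateContentConfig(
    tools=house_tools,
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)
contents = [
    types.Content(role="user", parts=[types.Part(text="Turn this place into a party!")])
]
response = client.models.generate_content(
    model="gemini-2.5-flash", config=config, contents=contents
)

# Build one function response part per requested call, in the order the calls
# were requested (assumes the model requested at least one call).
response_parts = [
    types.Part.from_function_response(
        name=fc.name, response={"result": impls[fc.name](**fc.args)}
    )
    for fc in response.function_calls
]
contents.append(response.candidates[0].content)  # the model's function call turn
contents.append(types.Content(role="user", parts=response_parts))  # results, in order

final_response = client.models.generate_content(
    model="gemini-2.5-flash", config=config, contents=contents
)
print(final_response.text)

The compositional example's code follows.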
import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . 
functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the loop orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . Compositional function calling is a native Live API feature. This means the Live API can handle function calling similarly to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write and run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write and run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , { tools : tools , modality : "AUDIO" }); Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . 
FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python functions to declarations and handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Calls the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool: Python from google import genai from google.genai import types # Define the function with type hints and docstring def get_current_temperature ( location : str ) - > dict : """Gets the current temperature for a given location. Args: location: The city and state, e.g. San Francisco, CA Returns: A dictionary containing the temperature and unit. """ # ... (implementation) ... return { "temperature" : 25 , "unit" : "Celsius" } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_current_temperature ] ) # Pass the function itself # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "What's the temperature in Boston?" , config = config , ) print ( response . text ) # The SDK handles the function call and returns the final text You can disable automatic function calling with: Python config = types . GenerateContentConfig ( tools = [ get_current_temperature ], automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ) ) Automatic function schema declaration Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary object. The API is able to describe any of the following types: Python AllowedType = ( int | float | bool | str | list [ 'AllowedType' ] | dict [ str , 'AllowedType' ]) To see what the inferred schema looks like, you can convert it using from_callable : Python def multiply ( a : float , b : float ): """Returns a * b.""" return a * b fn_decl = types . FunctionDeclaration . from_callable ( callable = multiply , client = client ) # to_json_dict() provides a clean JSON representation. print ( fn_decl . to_json_dict ()) Multi-tool use: Combine native tools with function calling You can enable multiple tools at the same time, combining native tools with function calling. Here's an example that enables two tools, Grounding with Google Search and code execution , in a request using the Live API . Note: Multi-tool use is a Live API-only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity.
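A minimal sketch of what such a run() wrapper might look like with the google-genai Live API is shown below. The model name and the trivial "ok" tool acknowledgment are assumptions for illustration; a real wrapper would execute the requested functions and handle audio output. Python

import asyncio
from google import genai
from google.genai import types

client = genai.Client()

async def run(prompt, tools=None, modality="AUDIO"):
    # Open a Live API session with the requested tools and response modality.
    config = types.LiveConnectConfig(response_modalities=[modality], tools=tools or [])
    # Assumed model name; substitute any Live API capable model.
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text=prompt)])
        )
        async for message in session.receive():
            if message.text:
                print(message.text)
            if message.tool_call:
                # A real app would execute each requested function here;
                # this sketch just acknowledges the calls.
                await session.send_tool_response(function_responses=[
                    types.FunctionResponse(
                        id=fc.id, name=fc.name, response={"result": "ok"}
                    )
                    for fc in message.tool_call.function_calls
                ])

With a wrapper along these lines, the calls below can be awaited from an asyncio event loop.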
Python # Multiple tasks example - combining lights, code execution, and search prompt = """ Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! """ tools = [ { 'google_search' : {}}, { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} # not defined here. ] # Execute the prompt with specified tools in audio modality await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Multiple tasks example - combining lights, code execution, and search const prompt = ` Hey, I need you to do three things for me. 1. Turn on the lights. 2. Then compute the largest prime palindrome under 100000. 3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. Thanks! ` ; const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } // not defined here. ]; // Execute the prompt with specified tools in audio modality await run ( prompt , { tools : tools , modality : "AUDIO" }); Python developers can try this out in the Live API Tool Use notebook . Model context protocol (MCP) Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts. The Gemini SDKs have built-in support for MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDKs can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until no more tool calls are made by the model. Here, you can find an example of how to use a local MCP server with Gemini and the mcp SDK. Python Make sure the latest version of the mcp SDK is installed on your platform of choice. pip install mcp Note: Python supports automatic tool calling by passing the ClientSession into the tools parameter. If you want to disable it, you can provide automatic_function_calling with disable=True . import os import asyncio from datetime import datetime from mcp import ClientSession , StdioServerParameters from mcp.client.stdio import stdio_client from google import genai client = genai . Client () # Create server parameters for stdio connection server_params = StdioServerParameters ( command = "npx" , # Executable args = [ "-y" , "@philschmid/weather-mcp" ], # MCP Server env = None , # Optional environment variables ) async def run (): async with stdio_client ( server_params ) as ( read , write ): async with ClientSession ( read , write ) as session : # Prompt to get the weather for the current day in London. prompt = f "What is the weather in London in { datetime . now () . strftime ( '%Y-%m-%d' ) } ?" # Initialize the connection between client and server await session . initialize () # Send request to the model with MCP function declarations response = await client . aio . models . generate_content ( model = "gemini-2.5-flash" , contents = prompt , config = genai . types . 
GenerateContentConfig ( temperature = 0 , tools = [ session ], # uses the session, will automatically call the tool # Uncomment if you **don't** want the SDK to automatically call the tool # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig( # disable=True # ), ), ) print ( response . text ) # Start the asyncio event loop and run the main function asyncio . run ( run ()) JavaScript Make sure the latest version of the mcp SDK is installed on your platform of choice. npm install @modelcontextprotocol/sdk Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool . If you want to disable it, you can provide automaticFunctionCalling with disable: true . import { GoogleGenAI , FunctionCallingConfigMode , mcpToTool } from '@google/genai' ; import { Client } from "@modelcontextprotocol/sdk/client/index.js" ; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js" ; // Create server parameters for stdio connection const serverParams = new StdioClientTransport ({ command : "npx" , // Executable args : [ "-y" , "@philschmid/weather-mcp" ] // MCP Server }); const client = new Client ( { name : "example-client" , version : "1.0.0" } ); // Configure the client const ai = new GoogleGenAI ({}); // Initialize the connection between client and server await client . connect ( serverParams ); // Send request to the model with MCP tools const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : `What is the weather in London in ${ new Date (). toLocaleDateString () } ?` , config : { tools : [ mcpToTool ( client )], // uses the session, will automatically call the tool // Uncomment if you **don't** want the sdk to automatically call the tool // automaticFunctionCalling: { // disable: true, // }, }, }); console . log ( response . text ) // Close the connection await client . close (); Limitations with built-in MCP support Built-in MCP support is an experimental feature in our SDKs and has the following limitations: Only tools are supported, not resources or prompts. It is available for the Python and JavaScript/TypeScript SDKs. Breaking changes might occur in future releases. Manual integration of MCP servers is always an option if these limitations affect what you're building. Supported models This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page.
Model | Function Calling | Parallel Function Calling | Compositional Function Calling
Gemini 2.5 Pro | ✔️ | ✔️ | ✔️
Gemini 2.5 Flash | ✔️ | ✔️ | ✔️
Gemini 2.5 Flash-Lite | ✔️ | ✔️ | ✔️
Gemini 2.0 Flash | ✔️ | ✔️ | ✔️
Gemini 2.0 Flash-Lite | X | X | X
Best practices Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments. Naming: Use descriptive function names (without spaces, periods, or dashes). Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum. Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools.
Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed. Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls. Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it. Error Handling: Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user. Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls. Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, or break complex tasks down into smaller, more focused function sets. Notes and limitations Only a subset of the OpenAPI schema is supported. Supported parameter types in Python are limited. Automatic function calling is a Python SDK feature only. \ No newline at end of file diff --git a/text_content/docs_function-calling_e09d4451.txt b/text_content/docs_function-calling_e09d4451.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8f1873dc792776af625820de77674a559052cc6 --- /dev/null +++ b/text_content/docs_function-calling_e09d4451.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/function-calling#thinking +Title: Function calling with the Gemini API  |  Google AI for Developers +================================================== + +Function calling with the Gemini API | Google AI for Developers Function calling lets you connect models to external tools and APIs. Instead of generating text responses, the model determines when to call specific functions and provides the necessary parameters to execute real-world actions. This allows the model to act as a bridge between natural language and real-world actions and data. Function calling has 3 primary use cases: Augment Knowledge: Access information from external sources like databases, APIs, and knowledge bases. Extend Capabilities: Use external tools to perform computations and overcome the limitations of the model, such as using a calculator or creating charts. Take Actions: Interact with external systems using APIs, such as scheduling appointments, creating invoices, sending emails, or controlling smart home devices.
How function calling works Function calling involves a structured interaction between your application, the model, and external functions. Here's a breakdown of the process: Define Function Declaration: Define the function declaration in your application code. Function Declarations describe the function's name, parameters, and purpose to the model. Call LLM with function declarations: Send the user prompt along with the function declaration(s) to the model. It analyzes the request and determines if a function call would be helpful. If so, it responds with a structured JSON object. Execute Function Code (Your Responsibility): The model does not execute the function itself. It's your application's responsibility to process the response and check for a function call. If yes: extract the name and args of the function and execute the corresponding function in your application. If no: the model has provided a direct text response to the prompt (this flow is less emphasized in the example but is a possible outcome). Create user-friendly response: If a function was executed, capture the result and send it back to the model in a subsequent turn of the conversation. It will use the result to generate a final, user-friendly response that incorporates the information from the function call. This process can be repeated over multiple turns, allowing for complex interactions and workflows. The model also supports calling multiple functions in a single turn ( parallel function calling ) and in sequence ( compositional function calling ). Step 1: Define a function declaration Define a function and its declaration within your application code that allows users to set light values and make an API request. This function could call external services or APIs. Python # Define a function that the model can call to control smart lights set_light_values_declaration = { "name" : "set_light_values" , "description" : "Sets the brightness and color temperature of a light." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "integer" , "description" : "Light level from 0 to 100. Zero is off and 100 is full brightness" , }, "color_temp" : { "type" : "string" , "enum" : [ "daylight" , "cool" , "warm" ], "description" : "Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`." , }, }, "required" : [ "brightness" , "color_temp" ], }, } # This is the actual function that would be called based on the model's suggestion def set_light_values ( brightness : int , color_temp : str ) - > dict [ str , int | str ]: """Set the brightness and color temperature of a room light. (mock API). Args: brightness: Light level from 0 to 100. Zero is off and 100 is full brightness color_temp: Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. Returns: A dictionary containing the set brightness and color temperature. """ return { "brightness" : brightness , "colorTemperature" : color_temp } JavaScript import { Type } from '@google/genai' ; // Define a function that the model can call to control smart lights const setLightValuesFunctionDeclaration = { name : 'set_light_values' , description : 'Sets the brightness and color temperature of a light.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'Light level from 0 to 100. Zero is off and 100 is full brightness' , }, color_temp : { type : Type . 
STRING , enum : [ 'daylight' , 'cool' , 'warm' ], description : 'Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`.' , }, }, required : [ 'brightness' , 'color_temp' ], }, }; /** * Set the brightness and color temperature of a room light. (mock API) * @param {number} brightness - Light level from 0 to 100. Zero is off and 100 is full brightness * @param {string} color_temp - Color temperature of the light fixture, which can be `daylight`, `cool` or `warm`. * @return {Object} A dictionary containing the set brightness and color temperature. */ function setLightValues ( brightness , color_temp ) { return { brightness : brightness , colorTemperature : color_temp }; } Step 2: Call the model with function declarations Once you have defined your function declarations, you can prompt the model to use them. It analyzes the prompt and function declarations and decides whether to respond directly or to call a function. If a function is called, the response object will contain a function call suggestion. Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () tools = types . Tool ( function_declarations = [ set_light_values_declaration ]) config = types . GenerateContentConfig ( tools = [ tools ]) # Define user prompt contents = [ types . Content ( role = "user" , parts = [ types . Part ( text = "Turn the lights down to a romantic level" )] ) ] # Send request with function declarations response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = contents , config = config , ) print ( response . candidates [ 0 ] . content . parts [ 0 ] . function_call ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Generation config with function declaration const config = { tools : [{ functionDeclarations : [ setLightValuesFunctionDeclaration ] }] }; // Configure the client const ai = new GoogleGenAI ({}); // Define user prompt const contents = [ { role : 'user' , parts : [{ text : 'Turn the lights down to a romantic level' }] } ]; // Send request with function declarations const response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( response . functionCalls [ 0 ]); The model then returns a functionCall object in an OpenAPI-compatible schema specifying how to call one or more of the declared functions in order to respond to the user's question. Python id = None args = { 'color_temp' : 'warm' , 'brightness' : 25 } name = 'set_light_values' JavaScript { name : 'set_light_values' , args : { brightness : 25 , color_temp : 'warm' } } Step 3: Execute set_light_values function code Extract the function call details from the model's response, parse the arguments , and execute the set_light_values function. Python # Extract tool call details; it may not be in the first part. tool_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call if tool_call . name == "set_light_values" : result = set_light_values ( ** tool_call . args ) print ( f "Function execution result: { result } " ) JavaScript // Extract tool call details const tool_call = response . functionCalls [ 0 ] let result ; if ( tool_call . name === 'set_light_values' ) { result = setLightValues ( tool_call . args . brightness , tool_call . args . color_temp ); console . log ( `Function execution result: ${ JSON . 
stringify ( result ) } ` ); } Step 4: Create user-friendly response with function result and call the model again Finally, send the result of the function execution back to the model so it can incorporate this information into its final response to the user. Python # Create a function response part function_response_part = types . Part . from_function_response ( name = tool_call . name , response = { "result" : result }, ) # Append function call and result of the function execution to contents contents . append ( response . candidates [ 0 ] . content ) # Append the content from the model's response. contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Create a function response part const function_response_part = { name : tool_call . name , response : { result } } // Append function call and result of the function execution to contents contents . push ( response . candidates [ 0 ]. content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); // Get the final response from the model const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); This completes the function calling flow. The model successfully used the set_light_values function to perform the action requested by the user. Function declarations When you implement function calling in a prompt, you create a tools object, which contains one or more function declarations . You define functions using JSON, specifically with a select subset of the OpenAPI schema format. A single function declaration can include the following parameters: name (string): A unique name for the function ( get_weather_forecast , send_email ). Use descriptive names without spaces or special characters (use underscores or camelCase). description (string): A clear and detailed explanation of the function's purpose and capabilities. This is crucial for the model to understand when to use the function. Be specific and provide examples if helpful ("Finds theaters based on location and optionally movie title which is currently playing in theaters."). parameters (object): Defines the input parameters the function expects. type (string): Specifies the overall data type, such as object . properties (object): Lists individual parameters, each with: type (string): The data type of the parameter, such as string , integer , boolean, array . description (string): A description of the parameter's purpose and format. Provide examples and constraints ("The city and state, e.g., 'San Francisco, CA' or a zip code e.g., '95616'."). enum (array, optional): If the parameter values are from a fixed set, use "enum" to list the allowed values instead of just describing them in the description. This improves accuracy ("enum": ["daylight", "cool", "warm"]). required (array): An array of strings listing the parameter names that are mandatory for the function to operate. Function calling with thinking Enabling "thinking" can improve function call performance by allowing the model to reason through a request before suggesting function calls. 
However, because the Gemini API is stateless, this reasoning context is lost between turns, which can reduce the quality of function calls, since they require multi-turn requests. To preserve this context, you can use thought signatures. A thought signature is an encrypted representation of the model's internal thought process that you pass back to the model on subsequent turns. To use thought signatures: Receive the signature: When thinking is enabled, the API response will include a thought_signature field containing an encrypted representation of the model's reasoning. Return the signature: When you send the function's execution result back to the server, include the thought_signature you received. This allows the model to restore its previous thinking context and will likely result in better function calling performance. Receiving signatures from the server Signatures are returned in the part generated after the model's thinking phase, which is typically a text or function call part. Here are some examples of what thought signatures look like returned in each type of part, in response to the request "What's the weather in Lake Tahoe?" using the Get Weather example: Text part [{ "candidates" : [ { "content" : { "parts" : [ { "text" : "Here's what the weather in Lake Tahoe is today" , "thoughtSignature" : "ClcBVKhc7ru7KzUI7SrdUoIdAYLm/+i93aHjfIt4xHyAoO/G70tApxnK2ujBhOhC1PrRy1pkQa88fqFvpHNVd1HDjNLO7mkp6/hFwE+SPPEB3fh0hs4oM8MKhgIBVKhc7uIGvrS7i/T4HpfbnYrluFfWNjZ62gewqe4cVdR/Dlh+zbjtYmDD0gPZ+SuBO7vvHQdzsjePRP+2Y5XddX6LEf/cGGgakq8EhVvw/a6IVzUO6XmpHg2Ag1sl8E9+VFH/lC0R0ZuYdFWligtDuYwp5p5q3o59G0TtWeU2MC1y2MJfE9u/KWd313ldka80/X2W/xF2O/4djMp5G2WKcULfve75zeRCy0mc5iS3SB9mTH0cT6x0vtKjeBx50gcg+CQWtJcRuwTVzz54dmvmK9xvnqA8gKGw3DuaM9wfy5hyY7Qg0z3iyyWdP8T/lbjKim8IEQOk7O1vVwP1Ko7oMYH8JgA1CsoBAVSoXO6v4c5RSyd1cn6EIU0pEFQsjW7rYWPuZdOFq/tsGJT9BCfW7KGkPGwlNSq8jTJFvbcJ/DjtndISQYXwiXd2kGa5JfdS2Kh4zOxCxiWtOk+2nCc3+XQk2nonhO+esGJpkDdbbHZSqRgcUtYKq7q28iPFOQvOFyCiZNB7K86Z/6Hnagu2snSlN/BcTMaFGaWpcCClSUo4foRZn3WbNCoM8rcpD7qEJMp4a5baaSxyyeL1ZTGd2HLpFys/oiW6e3oAnhxuIysCwg==" } ] , "role" : "model" } , "index" : 0 } ] , # Remainder of response... Function call part [{ "candidates" : [ { "content" : { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiwBVKhc7nRyTi3HmggPD9iQiRc261f5jwuMdw3H/itDH0emsb9ZVo3Nwx9p6wpsAVSoXO5i8fDV4jBSBLoaWxB5zUdlGY6aIGp+I0oEnwRRSRQ1LOvrDlojEH8JE8HjiKXALdJrvNPiG+HY3GZEO8pZjEZtc3UoBUh7+SVyjK7Xolu7aRYYeUyzrCapoETWypER1jbrJXnFV23hCosBAVSoXO6oIPNJSmbuEDfGafOhuCSHkpr1yjTp35RXYqmCESzRzWf5+nFXLqncqeFo4ohoxbiYQVpVQbOZF81p8o9zg6xeRE7qMeOv+XN7enXGJ4/s3qNFQpfkSMqRdBITN1VpX7jyfEAjvxBNc7PDfDJZmEPY338ZIY5nFFcmzJSWjVrboFt2sMFv+A==" } ] , "role" : "model" } , "finishReason" : "STOP" , "index" : 0 } ] , # Remainder of response... You can confirm that you received a signature and see what a signature looks like using the following code: # Step 2: Call the model with function declarations # ...Generation config, Configure the client, and Define user prompt (No changes) # Send request with declarations (using a thinking model) response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents ) # See thought signatures for part in response . candidates [ 0 ] . content . parts : if part . thought_signature : print ( "Thought signature:" ) print ( part . 
thought_signature ) Returning signatures back to the server In order to return signatures back: Return signatures along with their containing parts back to the server. Don't merge a part that has a signature with another part that also contains a signature; the signature string is not concatenable. Don't merge a part that has a signature with another part that lacks one; this breaks the correct positioning of the thought represented by the signature. The code will remain the same as in Step 4 of the previous section. But in this case (as indicated in the comment below) you will return signatures to the model along with the result of the function execution so the model can incorporate the thoughts into its final response: Python # Step 4: Create user friendly response with function result and call the model again # ...Create a function response part (No change) # Append thought signatures, function call and result of the function execution to contents function_call_content = response . candidates [ 0 ] . content # Append the model's function call message, which includes thought signatures contents . append ( function_call_content ) contents . append ( types . Content ( role = "user" , parts = [ function_response_part ])) # Append the function response final_response = client . models . generate_content ( model = "gemini-2.5-flash" , config = config , contents = contents , ) print ( final_response . text ) JavaScript // Step 4: Create user friendly response with function result and call the model again // ...Create a function response part (No change) // Append thought signatures, function call and result of the function execution to contents const function_response_content = response . candidates [ 0 ]. content ; contents . push ( function_response_content ); contents . push ({ role : 'user' , parts : [{ functionResponse : function_response_part }] }); const final_response = await ai . models . generateContent ({ model : 'gemini-2.5-flash' , contents : contents , config : config }); console . log ( final_response . text ); The following shows what a request returning a thought signature may look like: [{ "contents" : [ { "role" : "user" , "parts" : [ { "text" : "what is the weather in Lake Tahoe?" 
} ] } , { "parts" : [ { "functionCall" : { "name" : "getWeather" , "args" : { "city" : "Lake Tahoe" } } , "thoughtSignature" : "CiIBVKhc7oDPpCaXyJKKssjqr4g3JNOSgJ/M2V+1THC1icsWCmwBVKhc7pBABbZ+zR3e9234WnWWS6GFXmf8IVwpnzjd5KYd7vyJbn/4vTorWBGayj/vbd9JPaZQjxdAIXhoE5mX/MDsQ7M9N/b0qJjHm39tYIBvS4sIWkMDHqTJqXGLzhhKtrTkfbV3RbaJEkQKmwEBVKhc7qVUgC3hfTXZLo9R3AJzUUIx50NKvJTb9B+UU+LBqgg7Nck1x5OpjWVS2R+SsveprIuYOruk2Y0H53J2OJF8qsxTdIq2si8DGW2V7WK8xyoJH5kbqd7drIw1jLb44b6lx4SMyB0VaULuTBki4d+Ljjg1tJTwR0IYMKqDLDZt9mheINsi0ZxcNjfpnDydRXdWbcSwzmK/wgqJAQFUqFzuKgNVElxs3cbO+xebr2IwcOro84nKTisi0tTp9bICPC9fTUhn3L+rvQWA+d3J1Za8at2bakrqiRj7BTh+CVO9fWQMAEQAs3ni0Z2hfaYG92tOD26E4IoZwyYEoWbfNudpH1fr5tEkyqnEGtWIh7H+XoZQ2DXeiOa+br7Zk88SrNE+trJMCogBAVSoXO5e9fBLg7hnbkmKsrzNLnQtLsQm1gNzjcjEC7nJYklYPp0KI2uGBE1PkM8XNsfllAfHVn7LzHcHNlbQ9pJ7QZTSIeG42goS971r5wNZwxaXwCTphClQh826eqJWo6A/28TtAVQWLhTx5ekbP7qb4nh1UblESZ1saxDQAEo4OKPbDzx5BgqKAQFUqFzuVyjNm5i0wN8hTDnKjfpDroEpPPTs531iFy9BOX+xDCdGHy8D+osFpaoBq6TFekQQbz4hIoUR1YEcP4zI80/cNimEeb9IcFxZTTxiNrbhbbcv0969DSMWhB+ZEqIz4vuw4GLe/xcUvqhlChQwFdgIbdOQHSHpatn5uDlktnP/bi26nKuXIwo0AVSoXO7US22OUH7d1f4abNPI0IyAvhqkPp12rbtWLx9vkOtojE8IP+xCfYtIFuZIzRNZqA==" } ] , "role" : "model" } , { "role" : "user" , "parts" : [ { "functionResponse" : { "name" : "getWeather" , "response" : { "response" : { "stringValue" : "Sunny and hot. 90 degrees Fahrenheit" } } } } ] } ] , # Remainder of request... Learn more about limitations and usage of thought signatures, and about thinking models in general, on the Thinking page. Parallel function calling In addition to single-turn function calling, you can also call multiple functions at once. Parallel function calling lets you execute multiple functions at once and is used when the functions are not dependent on each other. This is useful in scenarios like gathering data from multiple independent sources, such as retrieving customer details from different databases, checking inventory levels across various warehouses, or performing multiple actions, such as converting your apartment into a disco. Python power_disco_ball = { "name" : "power_disco_ball" , "description" : "Powers the spinning disco ball." , "parameters" : { "type" : "object" , "properties" : { "power" : { "type" : "boolean" , "description" : "Whether to turn the disco ball on or off." , } }, "required" : [ "power" ], }, } start_music = { "name" : "start_music" , "description" : "Play some music matching the specified parameters." , "parameters" : { "type" : "object" , "properties" : { "energetic" : { "type" : "boolean" , "description" : "Whether the music is energetic or not." , }, "loud" : { "type" : "boolean" , "description" : "Whether the music is loud or not." , }, }, "required" : [ "energetic" , "loud" ], }, } dim_lights = { "name" : "dim_lights" , "description" : "Dim the lights." , "parameters" : { "type" : "object" , "properties" : { "brightness" : { "type" : "number" , "description" : "The brightness of the lights, 0.0 is off, 1.0 is full." , } }, "required" : [ "brightness" ], }, } JavaScript import { Type } from '@google/genai' ; const powerDiscoBall = { name : 'power_disco_ball' , description : 'Powers the spinning disco ball.' , parameters : { type : Type . OBJECT , properties : { power : { type : Type . BOOLEAN , description : 'Whether to turn the disco ball on or off.' } }, required : [ 'power' ] } }; const startMusic = { name : 'start_music' , description : 'Play some music matching the specified parameters.' , parameters : { type : Type . 
OBJECT , properties : { energetic : { type : Type . BOOLEAN , description : 'Whether the music is energetic or not.' }, loud : { type : Type . BOOLEAN , description : 'Whether the music is loud or not.' } }, required : [ 'energetic' , 'loud' ] } }; const dimLights = { name : 'dim_lights' , description : 'Dim the lights.' , parameters : { type : Type . OBJECT , properties : { brightness : { type : Type . NUMBER , description : 'The brightness of the lights, 0.0 is off, 1.0 is full.' } }, required : [ 'brightness' ] } }; Configure the function calling mode to allow using all of the specified tools. To learn more, you can read about configuring function calling . Python from google import genai from google.genai import types # Configure the client and tools client = genai . Client () house_tools = [ types . Tool ( function_declarations = [ power_disco_ball , start_music , dim_lights ]) ] config = types . GenerateContentConfig ( tools = house_tools , automatic_function_calling = types . AutomaticFunctionCallingConfig ( disable = True ), # Force the model to call 'any' function, instead of chatting. tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = 'ANY' ) ), ) chat = client . chats . create ( model = "gemini-2.5-flash" , config = config ) response = chat . send_message ( "Turn this place into a party!" ) # Print out each of the function calls requested from this single call print ( "Example 1: Forced function calling" ) for fn in response . function_calls : args = ", " . join ( f " { key } = { val } " for key , val in fn . args . items ()) print ( f " { fn . name } ( { args } )" ) JavaScript import { GoogleGenAI } from '@google/genai' ; // Set up function declarations const houseFns = [ powerDiscoBall , startMusic , dimLights ]; const config = { tools : [{ functionDeclarations : houseFns }], // Force the model to call 'any' function, instead of chatting. toolConfig : { functionCallingConfig : { mode : 'any' } } }; // Configure the client const ai = new GoogleGenAI ({}); // Create a chat session const chat = ai . chats . create ({ model : 'gemini-2.5-flash' , config : config }); const response = await chat . sendMessage ({ message : 'Turn this place into a party!' }); // Print out each of the function calls requested from this single call console . log ( "Example 1: Forced function calling" ); for ( const fn of response . functionCalls ) { const args = Object . entries ( fn . args ) . map (([ key , val ]) = > ` ${ key } = ${ val } ` ) . join ( ', ' ); console . log ( ` ${ fn . name } ( ${ args } )` ); } Each of the printed results reflects a single function call that the model has requested. To send the results back, include the responses in the same order as they were requested. The Python SDK supports automatic function calling , which automatically converts Python functions to declarations and handles the function call execution and response cycle for you. Following is an example for the disco use case. Note: Automatic Function Calling is a Python SDK only feature at the moment. Python from google import genai from google.genai import types # Actual function implementations def power_disco_ball_impl ( power : bool ) - > dict : """Powers the spinning disco ball. Args: power: Whether to turn the disco ball on or off. Returns: A status dictionary indicating the current state. 
""" return { "status" : f "Disco ball powered { 'on' if power else 'off' } " } def start_music_impl ( energetic : bool , loud : bool ) - > dict : """Play some music matching the specified parameters. Args: energetic: Whether the music is energetic or not. loud: Whether the music is loud or not. Returns: A dictionary containing the music settings. """ music_type = "energetic" if energetic else "chill" volume = "loud" if loud else "quiet" return { "music_type" : music_type , "volume" : volume } def dim_lights_impl ( brightness : float ) - > dict : """Dim the lights. Args: brightness: The brightness of the lights, 0.0 is off, 1.0 is full. Returns: A dictionary containing the new brightness setting. """ return { "brightness" : brightness } # Configure the client client = genai . Client () config = types . GenerateContentConfig ( tools = [ power_disco_ball_impl , start_music_impl , dim_lights_impl ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "Do everything you need to this place into party!" , config = config , ) print ( " \n Example 2: Automatic function calling" ) print ( response . text ) # I've turned on the disco ball, started playing loud and energetic music, and dimmed the lights to 50% brightness. Let's get this party started! Compositional function calling Compositional or sequential function calling allows Gemini to chain multiple function calls together to fulfill a complex request. For example, to answer "Get the temperature in my current location", the Gemini API might first invoke a get_current_location() function followed by a get_weather() function that takes the location as a parameter. The following example demonstrates how to implement compositional function calling using the Python SDK and automatic function calling. Python This example uses the automatic function calling feature of the google-genai Python SDK. The SDK automatically converts the Python functions to the required schema, executes the function calls when requested by the model, and sends the results back to the model to complete the task. import os from google import genai from google.genai import types # Example Functions def get_weather_forecast ( location : str ) - > dict : """Gets the current weather temperature for a given location.""" print ( f "Tool Call: get_weather_forecast(location= { location } )" ) # TODO: Make API call print ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ) return { "temperature" : 25 , "unit" : "celsius" } # Dummy response def set_thermostat_temperature ( temperature : int ) - > dict : """Sets the thermostat to a desired temperature.""" print ( f "Tool Call: set_thermostat_temperature(temperature= { temperature } )" ) # TODO: Interact with a thermostat API print ( "Tool Response: {'status': 'success'}" ) return { "status" : "success" } # Configure the client and model client = genai . Client () config = types . GenerateContentConfig ( tools = [ get_weather_forecast , set_thermostat_temperature ] ) # Make the request response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , config = config , ) # Print the final, user-facing response print ( response . text ) Expected Output When you run the code, you will see the SDK orchestrating the function calls. 
The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . I 've set the thermostat to 20°C. JavaScript This example shows how to use the JavaScript/TypeScript SDK to do compositional function calling using a manual execution loop. import { GoogleGenAI , Type } from "@google/genai" ; // Configure the client const ai = new GoogleGenAI ({}); // Example Functions function get_weather_forecast ({ location }) { console . log ( `Tool Call: get_weather_forecast(location= ${ location } )` ); // TODO: Make API call console . log ( "Tool Response: {'temperature': 25, 'unit': 'celsius'}" ); return { temperature : 25 , unit : "celsius" }; } function set_thermostat_temperature ({ temperature }) { console . log ( `Tool Call: set_thermostat_temperature(temperature= ${ temperature } )` , ); // TODO: Make API call console . log ( "Tool Response: {'status': 'success'}" ); return { status : "success" }; } const toolFunctions = { get_weather_forecast , set_thermostat_temperature , }; const tools = [ { functionDeclarations : [ { name : "get_weather_forecast" , description : "Gets the current weather temperature for a given location." , parameters : { type : Type . OBJECT , properties : { location : { type : Type . STRING , }, }, required : [ "location" ], }, }, { name : "set_thermostat_temperature" , description : "Sets the thermostat to a desired temperature." , parameters : { type : Type . OBJECT , properties : { temperature : { type : Type . NUMBER , }, }, required : [ "temperature" ], }, }, ], }, ]; // Prompt for the model let contents = [ { role : "user" , parts : [ { text : "If it's warmer than 20°C in London, set the thermostat to 20°C, otherwise set it to 18°C." , }, ], }, ]; // Loop until the model has no more function calls to make while ( true ) { const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents , config : { tools }, }); if ( result . functionCalls && result . functionCalls . length > 0 ) { const functionCall = result . functionCalls [ 0 ]; const { name , args } = functionCall ; if ( ! toolFunctions [ name ]) { throw new Error ( `Unknown function call: ${ name } ` ); } // Call the function and get the response. const toolResponse = toolFunctions [ name ]( args ); const functionResponsePart = { name : functionCall . name , response : { result : toolResponse , }, }; // Send the function response back to the model. contents . push ({ role : "model" , parts : [ { functionCall : functionCall , }, ], }); contents . push ({ role : "user" , parts : [ { functionResponse : functionResponsePart , }, ], }); } else { // No more function calls, break the loop. console . log ( result . text ); break ; } } Expected Output When you run the code, you will see the loop orchestrating the function calls. The model first calls get_weather_forecast , receives the temperature, and then calls set_thermostat_temperature with the correct value based on the logic in the prompt. Tool Call : get_weather_forecast ( location = London ) Tool Response : { 'temperature' : 25 , 'unit' : 'celsius' } Tool Call : set_thermostat_temperature ( temperature = 20 ) Tool Response : { 'status' : 'success' } OK . It 's 25°C in London, so I' ve set the thermostat to 20 ° C . 
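For comparison with the automatic Python example above, here is a hedged sketch of the same compositional loop written manually with the Python SDK, with automatic function calling disabled. It reuses the get_weather_forecast and set_thermostat_temperature functions defined above; the tool_impls mapping is illustrative, not part of the official example. Python

from google import genai
from google.genai import types

client = genai.Client()

# Reuses the example functions defined above (assumed in scope).
tool_impls = {
    "get_weather_forecast": get_weather_forecast,
    "set_thermostat_temperature": set_thermostat_temperature,
}
config = types.GenerateContentConfig(
    tools=[types.Tool(function_declarations=[
        types.FunctionDeclaration.from_callable(callable=fn, client=client)
        for fn in tool_impls.values()
    ])],
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True),
)
contents = [types.Content(role="user", parts=[types.Part(text=(
    "If it's warmer than 20°C in London, set the thermostat to 20°C, "
    "otherwise set it to 18°C."))])]

# Loop until the model stops requesting function calls.
while True:
    response = client.models.generate_content(
        model="gemini-2.5-flash", config=config, contents=contents
    )
    if not response.function_calls:
        print(response.text)  # final, user-facing answer
        break
    # Execute the requested call(s), then append the model turn and the results.
    contents.append(response.candidates[0].content)
    contents.append(types.Content(role="user", parts=[
        types.Part.from_function_response(
            name=fc.name, response={"result": tool_impls[fc.name](**fc.args)}
        )
        for fc in response.function_calls
    ]))

This is essentially the loop that the SDK's automatic function calling runs for you.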
Compositional function calling is a native Live API feature. This means the Live API can handle function calling similarly to the Python SDK. Python # Light control schemas turn_on_the_lights_schema = { 'name' : 'turn_on_the_lights' } turn_off_the_lights_schema = { 'name' : 'turn_off_the_lights' } prompt = """ Hey, can you write and run some python code to turn on the lights, wait 10s and then turn off the lights? """ tools = [ { 'code_execution' : {}}, { 'function_declarations' : [ turn_on_the_lights_schema , turn_off_the_lights_schema ]} ] await run ( prompt , tools = tools , modality = "AUDIO" ) JavaScript // Light control schemas const turnOnTheLightsSchema = { name : 'turn_on_the_lights' }; const turnOffTheLightsSchema = { name : 'turn_off_the_lights' }; const prompt = ` Hey, can you write and run some python code to turn on the lights, wait 10s and then turn off the lights? ` ; const tools = [ { codeExecution : {} }, { functionDeclarations : [ turnOnTheLightsSchema , turnOffTheLightsSchema ] } ]; await run ( prompt , { tools : tools , modality : "AUDIO" }); Function calling modes The Gemini API lets you control how the model uses the provided tools (function declarations). Specifically, you can set the mode within the function_calling_config . AUTO (Default) : The model decides whether to generate a natural language response or suggest a function call based on the prompt and context. This is the most flexible mode and recommended for most scenarios. ANY : The model is constrained to always predict a function call and guarantees function schema adherence. If allowed_function_names is not specified, the model can choose from any of the provided function declarations. If allowed_function_names is provided as a list, the model can only choose from the functions in that list. Use this mode when you require a function call response to every prompt (if applicable). NONE : The model is prohibited from making function calls. This is equivalent to sending a request without any function declarations. Use this to temporarily disable function calling without removing your tool definitions. Python from google.genai import types # Configure function calling mode tool_config = types . ToolConfig ( function_calling_config = types . FunctionCallingConfig ( mode = "ANY" , allowed_function_names = [ "get_current_temperature" ] ) ) # Create the generation config config = types . GenerateContentConfig ( tools = [ tools ], # not defined here. tool_config = tool_config , ) JavaScript import { FunctionCallingConfigMode } from '@google/genai' ; // Configure function calling mode const toolConfig = { functionCallingConfig : { mode : FunctionCallingConfigMode . ANY , allowedFunctionNames : [ 'get_current_temperature' ] } }; // Create the generation config const config = { tools : tools , // not defined here. toolConfig : toolConfig , }; Automatic function calling (Python only) When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python functions to declarations and handles the function call execution and the response cycle for you. The Python SDK then automatically: Detects function call responses from the model. Calls the corresponding Python function in your code. Sends the function response back to the model. Returns the model's final text response. 
Automatic function calling (Python only)

When using the Python SDK, you can provide Python functions directly as tools. The SDK automatically converts the Python function to a declaration, and handles the function call execution and response cycle for you. The Python SDK then automatically:

Detects function call responses from the model.
Calls the corresponding Python function in your code.
Sends the function response back to the model.
Returns the model's final text response.

To use this, define your function with type hints and a docstring, and then pass the function itself (not a JSON declaration) as a tool:

Python

from google import genai
from google.genai import types

# Define the function with type hints and docstring
def get_current_temperature(location: str) -> dict:
    """Gets the current temperature for a given location.

    Args:
        location: The city and state, e.g. San Francisco, CA

    Returns:
        A dictionary containing the temperature and unit.
    """
    # ... (implementation) ...
    return {"temperature": 25, "unit": "Celsius"}

# Configure the client
client = genai.Client()
config = types.GenerateContentConfig(
    tools=[get_current_temperature]
)  # Pass the function itself

# Make the request
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's the temperature in Boston?",
    config=config,
)

print(response.text)  # The SDK handles the function call and returns the final text

You can disable automatic function calling with:

Python

config = types.GenerateContentConfig(
    tools=[get_current_temperature],
    automatic_function_calling=types.AutomaticFunctionCallingConfig(disable=True)
)

Automatic function schema declaration

Automatic schema extraction from Python functions doesn't work in all cases. For example, it doesn't handle cases where you describe the fields of a nested dictionary object. The API is able to describe any of the following types:

Python

AllowedType = (int | float | bool | str | list['AllowedType'] | dict[str, 'AllowedType'])

To see what the inferred schema looks like, you can convert it using from_callable:

Python

def multiply(a: float, b: float):
    """Returns a * b."""
    return a * b

fn_decl = types.FunctionDeclaration.from_callable(callable=multiply, client=client)

# to_json_dict() provides a clean JSON representation.
print(fn_decl.to_json_dict())
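When automatic extraction falls short, such as a parameter that is a nested object with described fields, you can fall back to writing the declaration by hand with types.FunctionDeclaration and types.Schema. The following is a minimal sketch; the set_alarm function and its fields are hypothetical, purely to illustrate a nested schema:

from google.genai import types

# Hand-written declaration for a parameter with nested, described fields,
# which automatic schema extraction does not infer from type hints alone.
set_alarm = types.FunctionDeclaration(
    name="set_alarm",
    description="Sets an alarm with a nested schedule object.",
    parameters=types.Schema(
        type="OBJECT",
        properties={
            "schedule": types.Schema(
                type="OBJECT",
                description="When the alarm should fire.",
                properties={
                    "hour": types.Schema(type="INTEGER", description="Hour, 0-23."),
                    "minute": types.Schema(type="INTEGER", description="Minute, 0-59."),
                },
                required=["hour", "minute"],
            ),
            "label": types.Schema(type="STRING"),
        },
        required=["schedule"],
    ),
)

tools = types.Tool(function_declarations=[set_alarm])

The hand-written declaration can then be passed through GenerateContentConfig(tools=[tools]) exactly like the extracted ones.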
Multi-tool use: Combine native tools with function calling

You can enable multiple tools, combining native tools with function calling, at the same time. Here's an example that enables two tools, Grounding with Google Search and code execution, in a request using the Live API.

Note: Multi-tool use is a Live API-only feature at the moment. The run() function declaration, which handles the asynchronous websocket setup, is omitted for brevity.

Python

# Multiple tasks example - combining lights, code execution, and search
prompt = """
Hey, I need you to do three things for me.

1. Turn on the lights.
2. Then compute the largest prime palindrome under 100000.
3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024.

Thanks!
"""

tools = [
    {'google_search': {}},
    {'code_execution': {}},
    {'function_declarations': [turn_on_the_lights_schema, turn_off_the_lights_schema]}  # not defined here.
]

# Execute the prompt with specified tools in audio modality
await run(prompt, tools=tools, modality="AUDIO")

JavaScript

// Multiple tasks example - combining lights, code execution, and search
const prompt = `
Hey, I need you to do three things for me.

1. Turn on the lights.
2. Then compute the largest prime palindrome under 100000.
3. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024.

Thanks!
`;

const tools = [
  { googleSearch: {} },
  { codeExecution: {} },
  { functionDeclarations: [turnOnTheLightsSchema, turnOffTheLightsSchema] }  // not defined here.
];

// Execute the prompt with specified tools in audio modality
await run(prompt, { tools: tools, modality: "AUDIO" });

Python developers can try this out in the Live API Tool Use notebook.

Model context protocol (MCP)

Model Context Protocol (MCP) is an open standard for connecting AI applications with external tools and data. MCP provides a common protocol for models to access context, such as functions (tools), data sources (resources), or predefined prompts.

The Gemini SDKs have built-in support for MCP, reducing boilerplate code and offering automatic tool calling for MCP tools. When the model generates an MCP tool call, the Python and JavaScript client SDKs can automatically execute the MCP tool and send the response back to the model in a subsequent request, continuing this loop until the model makes no more tool calls.

Here, you can find an example of how to use a local MCP server with Gemini and the mcp SDK.

Python

Make sure the latest version of the mcp SDK is installed on your platform of choice.

pip install mcp

Note: Python supports automatic tool calling by passing the ClientSession into the tools parameter. If you want to disable it, you can set automatic_function_calling with disable=True.

import os
import asyncio
from datetime import datetime
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
from google import genai

client = genai.Client()

# Create server parameters for stdio connection
server_params = StdioServerParameters(
    command="npx",  # Executable
    args=["-y", "@philschmid/weather-mcp"],  # MCP Server
    env=None,  # Optional environment variables
)

async def run():
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            # Prompt to get the weather for the current day in London.
            prompt = f"What is the weather in London in {datetime.now().strftime('%Y-%m-%d')}?"

            # Initialize the connection between client and server
            await session.initialize()

            # Send request to the model with MCP function declarations
            response = await client.aio.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt,
                config=genai.types.GenerateContentConfig(
                    temperature=0,
                    tools=[session],  # uses the session, will automatically call the tool
                    # Uncomment if you **don't** want the SDK to automatically call the tool
                    # automatic_function_calling=genai.types.AutomaticFunctionCallingConfig(
                    #     disable=True
                    # ),
                ),
            )
            print(response.text)

# Start the asyncio event loop and run the main function
asyncio.run(run())

JavaScript

Make sure the latest version of the mcp SDK is installed on your platform of choice.

npm install @modelcontextprotocol/sdk

Note: JavaScript supports automatic tool calling by wrapping the client with mcpToTool. If you want to disable it, you can set automaticFunctionCalling with disable: true.
import { GoogleGenAI, FunctionCallingConfigMode, mcpToTool } from '@google/genai';
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

// Create server parameters for stdio connection
const serverParams = new StdioClientTransport({
  command: "npx",  // Executable
  args: ["-y", "@philschmid/weather-mcp"]  // MCP Server
});

const client = new Client({
  name: "example-client",
  version: "1.0.0"
});

// Configure the client
const ai = new GoogleGenAI({});

// Initialize the connection between client and server
await client.connect(serverParams);

// Send request to the model with MCP tools
const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: `What is the weather in London in ${new Date().toLocaleDateString()}?`,
  config: {
    tools: [mcpToTool(client)],  // uses the session, will automatically call the tool
    // Uncomment if you **don't** want the sdk to automatically call the tool
    // automaticFunctionCalling: {
    //   disable: true,
    // },
  },
});

console.log(response.text)

// Close the connection
await client.close();

Limitations with built-in MCP support

Built-in MCP support is an experimental feature in our SDKs and has the following limitations:

Only tools are supported, not resources or prompts.
It is available for the Python and JavaScript/TypeScript SDKs.
Breaking changes might occur in future releases.

Manual integration of MCP servers is always an option if these limitations affect what you're building (see the sketch below).
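A manual integration is essentially a thin loop over the MCP session: list the server's tools, hand their schemas to the model as function declarations, and execute any resulting call with session.call_tool. The following is a rough sketch, reusing the stdio session, client, and prompt from the Python example above; it assumes the server's input schemas fit the subset of JSON schema that Gemini accepts (in practice you may need to prune unsupported fields), and error handling is omitted:

# Inside the same `async with ClientSession(...) as session:` block as above.
mcp_tools = await session.list_tools()
declarations = [
    {
        "name": tool.name,
        "description": tool.description,
        "parameters": tool.inputSchema,  # JSON schema describing the tool's arguments
    }
    for tool in mcp_tools.tools
]

response = await client.aio.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    config=genai.types.GenerateContentConfig(
        tools=[genai.types.Tool(function_declarations=declarations)],
    ),
)

if response.function_calls:
    fc = response.function_calls[0]
    # Execute the MCP tool yourself and inspect (or relay) the result.
    result = await session.call_tool(fc.name, arguments=dict(fc.args))
    print(result.content)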
Supported models

This section lists models and their function calling capabilities. Experimental models are not included. You can find a comprehensive capabilities overview on the model overview page.

Model | Function Calling | Parallel Function Calling | Compositional Function Calling
Gemini 2.5 Pro | ✔️ | ✔️ | ✔️
Gemini 2.5 Flash | ✔️ | ✔️ | ✔️
Gemini 2.5 Flash-Lite | ✔️ | ✔️ | ✔️
Gemini 2.0 Flash | ✔️ | ✔️ | ✔️
Gemini 2.0 Flash-Lite | X | X | X

Best practices

Function and Parameter Descriptions: Be extremely clear and specific in your descriptions. The model relies on these to choose the correct function and provide appropriate arguments.
Naming: Use descriptive function names (without spaces, periods, or dashes).
Strong Typing: Use specific types (integer, string, enum) for parameters to reduce errors. If a parameter has a limited set of valid values, use an enum.
Tool Selection: While the model can use an arbitrary number of tools, providing too many can increase the risk of selecting an incorrect or suboptimal tool. For best results, aim to provide only the relevant tools for the context or task, ideally keeping the active set to a maximum of 10-20. Consider dynamic tool selection based on conversation context if you have a large total number of tools.
Prompt Engineering: Provide context: Tell the model its role (e.g., "You are a helpful weather assistant."). Give instructions: Specify how and when to use functions (e.g., "Don't guess dates; always use a future date for forecasts."). Encourage clarification: Instruct the model to ask clarifying questions if needed.
Temperature: Use a low temperature (e.g., 0) for more deterministic and reliable function calls.
Validation: If a function call has significant consequences (e.g., placing an order), validate the call with the user before executing it.
Error Handling: Implement robust error handling in your functions to gracefully handle unexpected inputs or API failures. Return informative error messages that the model can use to generate helpful responses to the user.
Security: Be mindful of security when calling external APIs. Use appropriate authentication and authorization mechanisms. Avoid exposing sensitive data in function calls.
Token Limits: Function descriptions and parameters count towards your input token limit. If you're hitting token limits, consider limiting the number of functions or the length of the descriptions, or breaking down complex tasks into smaller, more focused function sets.

Notes and limitations

Only a subset of the OpenAPI schema is supported.
Supported parameter types in Python are limited.
Automatic function calling is a Python SDK feature only.

Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-10 UTC.
\ No newline at end of file
diff --git a/text_content/docs_image-understanding_5d89bbde.txt b/text_content/docs_image-understanding_5d89bbde.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bafda995ddf15fc1ad603b01f91c859b9f524cc0
--- /dev/null
+++ b/text_content/docs_image-understanding_5d89bbde.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/image-understanding#main-content
+Title: Image understanding  |  Gemini API  |  Google AI for Developers
+==================================================

Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to the generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . 
text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . 
ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates are relative to the image dimensions and scaled to [0, 1000], so you need to descale them based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) 
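To visually verify the results, you can draw the converted boxes back onto the image. A small sketch using Pillow, continuing from the image and converted_bounding_boxes variables in the example above (the output filename is arbitrary):

from PIL import ImageDraw

# Draw each descaled box onto a copy of the original image.
annotated = image.convert("RGB")
draw = ImageDraw.Draw(annotated)
for (x1, y1, x2, y2) in converted_bounding_boxes:
    draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
annotated.save("annotated.png")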
Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". It also supports custom labels like "label the items with the allergens they can contain". For more examples, check the following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as a base64-encoded PNG that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = "\n" . join ( lines [ i + 1 :]) # Remove everything before "```json" json_output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) 
x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 >= y1 or x0 >= x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question answering, image classification, object detection and segmentation. Depending on your quality and performance requirements, Gemini can reduce the need to use specialized ML models. Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. 
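As a back-of-the-envelope helper, the 2.0/2.5 rule above can be sketched in Python. The exact preprocessing (how images are resized and cropped into tiles) is not fully specified here, so treat this as an estimate rather than an exact accounting:

import math

def estimate_image_tokens(width: int, height: int) -> int:
    # Small images cost a flat 258 tokens (Gemini 2.0 Flash / 2.5 Flash/Pro rule).
    if width <= 384 and height <= 384:
        return 258
    # Larger images are tiled into 768x768 tiles, 258 tokens each
    # (assumes a simple ceiling division; actual tiling may resize first).
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(300, 300))   # 258
print(estimate_image_tokens(1600, 900))  # 6 tiles -> 1548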
System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_image-understanding_9161e6f0.txt b/text_content/docs_image-understanding_9161e6f0.txt new file mode 100644 index 0000000000000000000000000000000000000000..71945e97f20dcea5ce2c5c7e4c028e8b129f3635 --- /dev/null +++ b/text_content/docs_image-understanding_9161e6f0.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#image-input +Title: Image understanding  |  Gemini API  |  Google AI for Developers +================================================== + +Image understanding | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing. Python from google.genai import types with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs . 
readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . 
ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . 
text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". It also support custom labels like "label the items with the allergens they can contain". For more examples, check following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as base64 encoded png that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parsing out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (which will be converted by the SDK) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . 
size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ]) # Skip invalid boxes if y0 > = y1 or x0 > = x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Create overlay for the mask color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 128 : # Threshold for mask overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be utilized in a wide range of image processing and computer vision tasks including but not limited to image captioning, visual question and answering, image classification, object detection and segmentation. Gemini can reduce the need to use specialized ML models depending on your quality and performance requirements. Some later model versions are specifically trained improve accuracy of specialized tasks in addition to generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini. 
System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_image-understanding_b610765a.txt b/text_content/docs_image-understanding_b610765a.txt new file mode 100644 index 0000000000000000000000000000000000000000..b32e52e308ed53a9e50ca56416c8422a070f0a21 --- /dev/null +++ b/text_content/docs_image-understanding_b610765a.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/image-understanding#object-detection +Title: Image understanding  |  Gemini API  |  Google AI for Developers +================================================== + +Image understanding | Gemini API | Google AI for Developers Image understanding Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models. Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details. Passing images to Gemini You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests. Passing inline image data You can pass inline image data in the request to generateContent . You can provide image data as Base64-encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to the generateContent API for processing. Python from google import genai from google.genai import types client = genai . Client () with open ( 'path/to/small-sample.jpg' , 'rb' ) as f : image_bytes = f . read () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = [ types . Part . from_bytes ( data = image_bytes , mime_type = 'image/jpeg' , ), 'Caption this image.' ] ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64ImageFile = fs .
readFileSync ( "path/to/small-sample.jpg" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "image/jpeg" , data : base64ImageFile , }, }, { text : "Caption this image." }, ]; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : contents , }); console . log ( response . text ); Go bytes , _ := os . ReadFile ( "path/to/small-sample.jpg" ) parts := [] * genai . Part { genai . NewPartFromBytes ( bytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST IMG_PATH = "/path/to/your/image1.jpg" if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"image/jpeg", "data": "' " $( base64 $B64FLAGS $IMG_PATH ) " '" } }, {"text": "Caption this image."}, ] }] }' 2 > /dev/null You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples. Python from google import genai from google.genai import types import requests image_path = "https://goo.gle/instrument-img" image_bytes = requests . get ( image_path ) . content image = types . Part . from_bytes ( data = image_bytes , mime_type = "image/jpeg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is this image?" , image ], ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; async function main () { const ai = new GoogleGenAI ({}); const imageUrl = "https://goo.gle/instrument-img" ; const response = await fetch ( imageUrl ); const imageArrayBuffer = await response . arrayBuffer (); const base64ImageData = Buffer . from ( imageArrayBuffer ). toString ( 'base64' ); const result = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ { inlineData : { mimeType : 'image/jpeg' , data : base64ImageData , }, }, { text : "Caption this image." } ], }); console . log ( result . text ); } main (); Go package main import ( "context" "fmt" "os" "io" "net/http" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Download the image. imageResp , _ := http . Get ( "https://goo.gle/instrument-img" ) imageBytes , _ := io . ReadAll ( imageResp . Body ) parts := [] * genai . Part { genai . NewPartFromBytes ( imageBytes , "image/jpeg" ), genai . NewPartFromText ( "Caption this image." ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMG_URL = "https://goo.gle/instrument-img" MIME_TYPE = $( curl -sIL " $IMG_URL " | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1 ) if [[ -z " $MIME_TYPE " || ! 
" $MIME_TYPE " == image/* ]] ; then MIME_TYPE = "image/jpeg" fi # Check for macOS if [[ " $( uname ) " == "Darwin" ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -b 0 ) elif [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 ) else IMAGE_B64 = $( curl -sL " $IMG_URL " | base64 -w0 ) fi curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ { "inline_data": { "mime_type":"' " $MIME_TYPE " '", "data": "' " $IMAGE_B64 " '" } }, {"text": "Caption this image."} ] }] }' 2 > /dev/null Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. Files API is also more efficient for scenarios that use the same image repeatedly. Uploading images using the File API For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples. Python from google import genai client = genai . Client () my_file = client . files . upload ( file = "path/to/sample.jpg" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ my_file , "Caption this image." ], ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.jpg" , config : { mimeType : "image/jpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Caption this image." , ]), }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } uploadedFile , _ := client . Files . UploadFromPath ( ctx , "path/to/sample.jpg" , nil ) parts := [] * genai . Part { genai . NewPartFromText ( "Caption this image." ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST IMAGE_PATH = "path/to/sample.jpg" MIME_TYPE = $( file -b --mime-type " ${ IMAGE_PATH } " ) NUM_BYTES = $( wc -c < " ${ IMAGE_PATH } " ) DISPLAY_NAME = IMAGE tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload url is in the response headers dump them to a file. 
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes. curl " ${ upload_url } " \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Caption this image."}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Prompting with multiple images You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references. Python from google import genai from google.genai import types client = genai . Client () # Upload the first image image1_path = "path/to/image1.jpg" uploaded_file = client . files . upload ( file = image1_path ) # Prepare the second image as inline data image2_path = "path/to/image2.png" with open ( image2_path , 'rb' ) as f : img2_bytes = f . read () # Create the prompt with text and multiple images response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "What is different between these two images?" , uploaded_file , # Use the uploaded file reference types . Part . from_bytes ( data = img2_bytes , mime_type = 'image/png' ) ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); async function main () { // Upload the first image const image1_path = "path/to/image1.jpg" ; const uploadedFile = await ai . files . upload ({ file : image1_path , config : { mimeType : "image/jpeg" }, }); // Prepare the second image as inline data const image2_path = "path/to/image2.png" ; const base64Image2File = fs . readFileSync ( image2_path , { encoding : "base64" , }); // Create the prompt with text and multiple images const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : createUserContent ([ "What is different between these two images?" , createPartFromUri ( uploadedFile . uri , uploadedFile . mimeType ), { inlineData : { mimeType : "image/png" , data : base64Image2File , }, }, ]), }); console . log ( response . text ); } await main (); Go // Upload the first image image1Path := "path/to/image1.jpg" uploadedFile , _ := client . Files . UploadFromPath ( ctx , image1Path , nil ) // Prepare the second image as inline data image2Path := "path/to/image2.jpeg" imgBytes , _ := os . 
ReadFile ( image2Path ) parts := [] * genai . Part { genai . NewPartFromText ( "What is different between these two images?" ), genai . NewPartFromBytes ( imgBytes , "image/jpeg" ), genai . NewPartFromURI ( uploadedFile . URI , uploadedFile . MIMEType ), } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) REST # Upload the first image IMAGE1_PATH = "path/to/image1.jpg" MIME1_TYPE = $( file -b --mime-type " ${ IMAGE1_PATH } " ) NUM1_BYTES = $( wc -c < " ${ IMAGE1_PATH } " ) DISPLAY_NAME1 = IMAGE1 tmp_header_file1 = upload-header1.tmp curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D upload-header1.tmp \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME1_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME1 } '}}" 2 > /dev/null upload_url1 = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file1 } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file1 } " curl " ${ upload_url1 } " \ -H "Content-Length: ${ NUM1_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ IMAGE1_PATH } " 2 > /dev/null > file_info1.json file1_uri = $( jq ".file.uri" file_info1.json ) echo file1_uri = $file1_uri # Prepare the second image (inline) IMAGE2_PATH = "path/to/image2.png" MIME2_TYPE = $( file -b --mime-type " ${ IMAGE2_PATH } " ) if [[ " $( base64 --version 2>&1 ) " = * "FreeBSD" * ]] ; then B64FLAGS = "--input" else B64FLAGS = "-w0" fi IMAGE2_BASE64 = $( base64 $B64FLAGS $IMAGE2_PATH ) # Now generate content using both images curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "What is different between these two images?"}, {"file_data":{"mime_type": "' " ${ MIME1_TYPE } " '", "file_uri": ' $file1_uri '}}, { "inline_data": { "mime_type":"' " ${ MIME2_TYPE } " '", "data": "' " $IMAGE2_BASE64 " '" } } ] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Object detection From Gemini 2.0 onwards, models are further trained to detect objects in an image and get their bounding box coordinates. The coordinates, relative to image dimensions, scale to [0, 1000]. You need to descale these coordinates based on your original image size. Python from google import genai from google.genai import types from PIL import Image import json client = genai . Client () prompt = "Detect the all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000." image = Image . open ( "/path/to/image.png" ) config = types . GenerateContentConfig ( response_mime_type = "application/json" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , prompt ], config = config ) width , height = image . size bounding_boxes = json . loads ( response . 
text ) converted_bounding_boxes = [] for bounding_box in bounding_boxes : abs_y1 = int ( bounding_box [ "box_2d" ][ 0 ] / 1000 * height ) abs_x1 = int ( bounding_box [ "box_2d" ][ 1 ] / 1000 * width ) abs_y2 = int ( bounding_box [ "box_2d" ][ 2 ] / 1000 * height ) abs_x2 = int ( bounding_box [ "box_2d" ][ 3 ] / 1000 * width ) converted_bounding_boxes . append ([ abs_x1 , abs_y1 , abs_x2 , abs_y2 ]) print ( "Image size: " , width , height ) print ( "Bounding boxes:" , converted_bounding_boxes ) Note: The model also supports generating bounding boxes based on custom instructions, such as: "Show bounding boxes of all green objects in this image". It also supports custom labels, like "label the items with the allergens they can contain". For more examples, check the following notebooks in the Gemini Cookbook : 2D spatial understanding notebook Experimental 3D pointing notebook Segmentation Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks. The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box (" box_2d ") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label (" label ") that identifies the object, and finally the segmentation mask inside the bounding box, as a base64-encoded PNG that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint). Note: For better results, disable thinking by setting the thinking budget to 0. See the code sample below for an example. Python from google import genai from google.genai import types from PIL import Image , ImageDraw import io import base64 import json import numpy as np import os client = genai . Client () def parse_json ( json_output : str ): # Parse out the markdown fencing lines = json_output . splitlines () for i , line in enumerate ( lines ): if line == "```json" : json_output = " \n " . join ( lines [ i + 1 :]) # Remove everything before "```json" json_output = json_output . split ( "```" )[ 0 ] # Remove everything after the closing "```" break # Exit the loop once "```json" is found return json_output def extract_segmentation_masks ( image_path : str , output_dir : str = "segmentation_outputs" ): # Load and resize image im = Image . open ( image_path ) im . thumbnail ([ 1024 , 1024 ], Image . Resampling . LANCZOS ) prompt = """ Give the segmentation masks for the wooden and glass items. Output a JSON list of segmentation masks where each entry contains the 2D bounding box in the key "box_2d", the segmentation mask in key "mask", and the text label in the key "label". Use descriptive labels. """ config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # set thinking_budget to 0 for better results in object detection ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ prompt , im ], # Pillow images can be directly passed as inputs (the SDK converts them) config = config ) # Parse JSON response items = json . loads ( parse_json ( response . text )) # Create output directory os . makedirs ( output_dir , exist_ok = True ) # Process each mask for i , item in enumerate ( items ): # Get bounding box coordinates box = item [ "box_2d" ] y0 = int ( box [ 0 ] / 1000 * im . size [ 1 ]) x0 = int ( box [ 1 ] / 1000 * im . size [ 0 ]) y1 = int ( box [ 2 ] / 1000 * im . size [ 1 ]) x1 = int ( box [ 3 ] / 1000 * im . size [ 0 ])
# Skip invalid boxes if y0 >= y1 or x0 >= x1 : continue # Process mask png_str = item [ "mask" ] if not png_str . startswith ( "data:image/png;base64," ): continue # Remove prefix png_str = png_str . removeprefix ( "data:image/png;base64," ) mask_data = base64 . b64decode ( png_str ) mask = Image . open ( io . BytesIO ( mask_data )) # Resize mask to match bounding box mask = mask . resize (( x1 - x0 , y1 - y0 ), Image . Resampling . BILINEAR ) # Convert mask to numpy array for processing mask_array = np . array ( mask ) # Create overlay for this mask overlay = Image . new ( 'RGBA' , im . size , ( 0 , 0 , 0 , 0 )) overlay_draw = ImageDraw . Draw ( overlay ) # Draw the mask onto the overlay color = ( 255 , 255 , 255 , 200 ) for y in range ( y0 , y1 ): for x in range ( x0 , x1 ): if mask_array [ y - y0 , x - x0 ] > 127 : # Binarize at the midpoint threshold overlay_draw . point (( x , y ), fill = color ) # Save individual mask and its overlay mask_filename = f " { item [ 'label' ] } _ { i } _mask.png" overlay_filename = f " { item [ 'label' ] } _ { i } _overlay.png" mask . save ( os . path . join ( output_dir , mask_filename )) # Create and save overlay composite = Image . alpha_composite ( im . convert ( 'RGBA' ), overlay ) composite . save ( os . path . join ( output_dir , overlay_filename )) print ( f "Saved mask and overlay for { item [ 'label' ] } to { output_dir } " ) # Example usage if __name__ == "__main__" : extract_segmentation_masks ( "path/to/image.png" ) Check the segmentation example in the cookbook guide for a more detailed example. An example segmentation output with objects and segmentation masks Supported image formats Gemini supports the following image format MIME types: PNG - image/png JPEG - image/jpeg WEBP - image/webp HEIC - image/heic HEIF - image/heif Capabilities All Gemini model versions are multimodal and can be used for a wide range of image processing and computer vision tasks, including but not limited to image captioning, visual question answering, image classification, object detection, and segmentation. Depending on your quality and performance requirements, Gemini can reduce the need for specialized ML models. Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to their generic capabilities: Gemini 2.0 models are further trained to support enhanced object detection . Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection . Limitations and key technical information File limit Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request. Token calculation Gemini 1.5 Flash and Gemini 1.5 Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled (min tile 256px, max 768px, resized to 768x768), with each tile costing 258 tokens. Gemini 2.0 Flash and Gemini 2.5 Flash/Pro : 258 tokens if both dimensions <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens. Tips and best practices Verify that images are correctly rotated. Use clear, non-blurry images. When using a single image with text, place the text prompt after the image part in the contents array. What's next This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources: Files API : Learn more about uploading and managing files for use with Gemini.
System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_libraries_0645df72.txt b/text_content/docs_libraries_0645df72.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d6bd472f7dd71b43848bf0161227ec03f119e5a --- /dev/null +++ b/text_content/docs_libraries_0645df72.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/libraries#new-libraries +Title: Gemini API libraries  |  Google AI for Developers +================================================== + +Gemini API libraries | Google AI for Developers Gemini API libraries When building with the Gemini API, we recommend using our official collection of libraries across major languages: the Google GenAI SDK . They are production ready under General Availability . Our samples and documentation across this site are built using these libraries. Note: If you're using one of our legacy libraries, we strongly recommend you migrate to the Google GenAI SDK. Review the legacy libraries section for more information. If you're new to the Gemini API, follow our quickstart guide to get started. Language support and installation The Google GenAI SDK is available for Python, JavaScript/TypeScript, Go, and Java. You can install each language's library using package managers, or visit their GitHub repos for further engagement: Python Library: google-genai GitHub Repository: googleapis/python-genai Installation: pip install google-genai JavaScript Library: @google/genai GitHub Repository: googleapis/js-genai Installation: npm install @google/genai Go Library: google.golang.org/genai GitHub Repository: googleapis/go-genai Installation: go get google.golang.org/genai Java Library: google-genai GitHub Repository: googleapis/java-genai Installation: If you're using Maven, add the following to your dependencies:
<dependency>
  <groupId>com.google.genai</groupId>
  <artifactId>google-genai</artifactId>
  <version>1.0.0</version>
</dependency>
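Once the library is installed, a minimal first call looks like the following sketch, assuming a GEMINI_API_KEY environment variable is configured as in the REST examples on this site (the model name matches the examples here and the prompt string is illustrative):

Python
from google import genai

# The client reads the API key from the GEMINI_API_KEY environment variable.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain what the Gemini API does in one sentence.",
)
print(response.text)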
General availability We started rolling out the Google GenAI SDK in late 2024. As of May 2025, it reached General Availability (GA) across all supported platforms. This means the libraries are stable and fully supported for production use. They are actively maintained, provide access to the latest features, and offer the best performance working with Gemini. If you're not using the Google GenAI SDK and are still on one of our legacy libraries, we strongly recommend you migrate. Review the legacy libraries section for more information. Legacy libraries and migration If you are using one of our legacy libraries, we recommend that you migrate to the new libraries . The legacy libraries don't provide access to recent features (such as Live API and Veo ) and are on a deprecation path: they will stop receiving updates at the end of September 2025, the feature gaps will grow, and potential bugs may no longer get fixed. Each legacy library's support status is detailed in the following table:

Language | Legacy library | Support status | Recommended library
Python | google-generativeai | All support, including bug fixes, ends end of September 2025. | google-genai
JavaScript/TypeScript | @google/generativeai | All support, including bug fixes, ends end of September 2025. | @google/genai
Go | google.golang.org/generative-ai | All support, including bug fixes, ends end of September 2025. | google.golang.org/genai
Dart and Flutter | google_generative_ai | Not actively maintained | Use trusted community or third-party libraries, like firebase_ai, or access using the REST API
Swift | generative-ai-swift | Not actively maintained | Use Gemini in Firebase
Android | generative-ai-android | Not actively maintained | Use Gemini in Firebase

Note for Java developers: There was no legacy Google-provided Java SDK for the Gemini API, so no migration from a previous Google library is required. You can start directly with the new library in the Language support and installation section.
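To make the migration concrete, here is a before-and-after sketch for Python. The legacy half is illustrative only and follows the commonly documented google-generativeai interface; the new half matches the google-genai examples used throughout this site:

Python
# Legacy library (google-generativeai); all support ends end of September 2025.
import google.generativeai as legacy_genai

legacy_genai.configure(api_key="YOUR_API_KEY")  # explicit key configuration
model = legacy_genai.GenerativeModel("gemini-1.5-flash")
print(model.generate_content("Hello").text)

# Recommended library (google-genai, the Google GenAI SDK).
from google import genai

client = genai.Client()  # picks up GEMINI_API_KEY from the environment
print(client.models.generate_content(model="gemini-2.5-flash", contents="Hello").text)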
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. \ No newline at end of file diff --git a/text_content/docs_live-guide_51ee6d26.txt b/text_content/docs_live-guide_51ee6d26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a2acfcaba2fe842392ec3df36ec1229eae342af --- /dev/null +++ b/text_content/docs_live-guide_51ee6d26.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#native-audio-output-thinking +Title: Live API capabilities guide  |  Gemini API  |  Google AI for Developers +================================================== + +Live API capabilities guide | Gemini API | Google AI for Developers Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers the capabilities and configurations available with the Live API. See the Get started with Live API page for an overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console .
debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . 
send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . 
readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . 
data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . 
close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list ). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. 
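For example, in an always-on voice assistant, proactive audio lets the model stay silent through background chatter or speech that isn't directed at it, and respond only when it is actually being addressed.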
To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . 
connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . 
TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. 
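In other words, the default expectation is that your own backend holds the API key and proxies traffic between your end users and the Live API.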
If you're implementing your Live API application using a client-to-server approach, you need to use ephemeral tokens to mitigate security risks.

Session duration

Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can use session management techniques to extend a session beyond these limits.

Context window

A session has a context window limit of:
- 128k tokens for native audio output models
- 32k tokens for other Live API models

Supported languages

Live API supports the following languages (BCP-47 codes). Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code.

German (Germany): de-DE
English (Australia)*: en-AU
English (UK)*: en-GB
English (India): en-IN
English (US): en-US
Spanish (US): es-US
French (France): fr-FR
Hindi (India): hi-IN
Portuguese (Brazil): pt-BR
Arabic (Generic): ar-XA
Spanish (Spain)*: es-ES
French (Canada)*: fr-CA
Indonesian (Indonesia): id-ID
Italian (Italy): it-IT
Japanese (Japan): ja-JP
Turkish (Turkey): tr-TR
Vietnamese (Vietnam): vi-VN
Bengali (India): bn-IN
Gujarati (India)*: gu-IN
Kannada (India)*: kn-IN
Marathi (India): mr-IN
Malayalam (India)*: ml-IN
Tamil (India): ta-IN
Telugu (India): te-IN
Dutch (Netherlands): nl-NL
Korean (South Korea): ko-KR
Mandarin Chinese (China)*: cmn-CN
Polish (Poland): pl-PL
Russian (Russia): ru-RU
Thai (Thailand): th-TH

Languages marked with an asterisk (*) are not available for Native audio.

What's next

Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio. For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook, the Live API Tools cookbook, and the Live API Get Started script.

\ No newline at end of file
diff --git a/text_content/docs_live-guide_7d294914.txt b/text_content/docs_live-guide_7d294914.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d9b02913486080cf127944f3a21b674fbba36df
--- /dev/null
+++ b/text_content/docs_live-guide_7d294914.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/live-guide#main-content
+Title: Live API capabilities guide  |  Gemini API  |  Google AI for Developers
+==================================================

Live API capabilities guide

Preview: The Live API is in preview.

This is a comprehensive guide that covers capabilities and configurations available with the Live API. See the Get started with Live API page for an overview and sample code for common use cases.
Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . 
debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . 
run(main())

JavaScript

import { GoogleGenAI, Modality } from '@google/genai';

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = { responseModalities: [Modality.AUDIO], outputAudioTranscription: {} };

async function live() {
  const responseQueue = [];

  async function waitMessage() {
    let done = false;
    let message = undefined;
    while (!done) {
      message = responseQueue.shift();
      if (message) {
        done = true;
      } else {
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
    }
    return message;
  }

  async function handleTurn() {
    const turns = [];
    let done = false;
    while (!done) {
      const message = await waitMessage();
      turns.push(message);
      if (message.serverContent && message.serverContent.turnComplete) {
        done = true;
      }
    }
    return turns;
  }

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  const inputTurns = 'Hello how are you?';
  session.sendClientContent({ turns: inputTurns });

  const turns = await handleTurn();

  for (const turn of turns) {
    if (turn.serverContent && turn.serverContent.outputTranscription) {
      console.debug('Received output transcription: %s\n', turn.serverContent.outputTranscription.text);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

You can enable transcription of the audio input by sending input_audio_transcription in the setup config.

Python

import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

config = {
    "response_modalities": ["TEXT"],
    "input_audio_transcription": {},
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        audio_data = Path("16000.pcm").read_bytes()

        await session.send_realtime_input(
            audio=types.Blob(data=audio_data, mime_type='audio/pcm;rate=16000')
        )

        async for msg in session.receive():
            if msg.server_content.input_transcription:
                print('Transcript:', msg.server_content.input_transcription.text)

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

import { GoogleGenAI, Modality } from '@google/genai';
import * as fs from "node:fs";
import pkg from 'wavefile';
const { WaveFile } = pkg;

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = { responseModalities: [Modality.TEXT], inputAudioTranscription: {} };

async function live() {
  const responseQueue = [];

  async function waitMessage() {
    let done = false;
    let message = undefined;
    while (!done) {
      message = responseQueue.shift();
      if (message) {
        done = true;
      } else {
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
    }
    return message;
  }

  async function handleTurn() {
    const turns = [];
    let done = false;
    while (!done) {
      const message = await waitMessage();
      turns.push(message);
      if (message.serverContent && message.serverContent.turnComplete) {
        done = true;
      }
    }
    return turns;
  }

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  // Send Audio Chunk
  const fileBuffer = fs.readFileSync("16000.wav");

  // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono)
  const wav = new WaveFile();
  wav.fromBuffer(fileBuffer);
  wav.toSampleRate(16000);
  wav.toBitDepth("16");
  const base64Audio = wav.toBase64();

  // If already in correct format, you can use this:
  // const fileBuffer = fs.readFileSync("sample.pcm");
  // const base64Audio = Buffer.from(fileBuffer).toString('base64');

  session.sendRealtimeInput({
    audio: { data: base64Audio, mimeType: "audio/pcm;rate=16000" }
  });

  const turns = await handleTurn();

  for (const turn of turns) {
    if (turn.text) {
      console.debug('Received text: %s\n', turn.text);
    } else if (turn.data) {
      console.debug('Received inline data: %s\n', turn.data);
    } else if (turn.serverContent && turn.serverContent.inputTranscription) {
      console.debug('Received input transcription: %s\n', turn.serverContent.inputTranscription.text);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

Stream audio and video

To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab

Change voice and language

The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list). You can listen to all the voices in AI Studio.

To specify a voice, set the voice name within the speechConfig object as part of the session configuration:

Python

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}}
    },
}

JavaScript

const config = {
  responseModalities: [Modality.AUDIO],
  speechConfig: { voiceConfig: { prebuiltVoiceConfig: { voiceName: "Kore" } } }
};

Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices.

The Live API supports multiple languages. To change the language, set the language code within the speechConfig object as part of the session configuration:

Python

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {"language_code": "de-DE"},
}

JavaScript

const config = {
  responseModalities: [Modality.AUDIO],
  speechConfig: { languageCode: "de-DE" }
};

Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code.

Native audio capabilities

The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation.

Note: Native audio models currently have limited tool use support. See Overview of supported tools for details.
How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. 
The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). 
toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. 
Response token breakdown:" )
        for detail in usage.response_tokens_details:
            match detail:
                case types.ModalityTokenCount(modality=modality, token_count=count):
                    print(f"{modality}: {count}")

JavaScript

const turns = await handleTurn();

for (const turn of turns) {
  if (turn.usageMetadata) {
    console.debug('Used %s tokens in total. Response token breakdown:\n', turn.usageMetadata.totalTokenCount);
    for (const detail of turn.usageMetadata.responseTokensDetails) {
      console.debug('%s\n', detail);
    }
  }
}

Media resolution

You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration:

Python

from google.genai import types

config = {
    "response_modalities": ["AUDIO"],
    "media_resolution": types.MediaResolution.MEDIA_RESOLUTION_LOW,
}

JavaScript

import { GoogleGenAI, Modality, MediaResolution } from '@google/genai';

const config = {
  responseModalities: [Modality.TEXT],
  mediaResolution: MediaResolution.MEDIA_RESOLUTION_LOW,
};

Limitations

Consider the following limitations of the Live API when you plan your project.

Response modalities

You can only set one response modality (TEXT or AUDIO) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session.

Client authentication

The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach, you need to use ephemeral tokens to mitigate security risks.

Session duration

Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can use session management techniques to extend a session beyond these limits.

Context window

A session has a context window limit of:
- 128k tokens for native audio output models
- 32k tokens for other Live API models

Supported languages

Live API supports the following languages (BCP-47 codes). Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code.

German (Germany): de-DE
English (Australia)*: en-AU
English (UK)*: en-GB
English (India): en-IN
English (US): en-US
Spanish (US): es-US
French (France): fr-FR
Hindi (India): hi-IN
Portuguese (Brazil): pt-BR
Arabic (Generic): ar-XA
Spanish (Spain)*: es-ES
French (Canada)*: fr-CA
Indonesian (Indonesia): id-ID
Italian (Italy): it-IT
Japanese (Japan): ja-JP
Turkish (Turkey): tr-TR
Vietnamese (Vietnam): vi-VN
Bengali (India): bn-IN
Gujarati (India)*: gu-IN
Kannada (India)*: kn-IN
Marathi (India): mr-IN
Malayalam (India)*: ml-IN
Tamil (India): ta-IN
Telugu (India): te-IN
Dutch (Netherlands): nl-NL
Korean (South Korea): ko-KR
Mandarin Chinese (China)*: cmn-CN
Polish (Poland): pl-PL
Russian (Russia): ru-RU
Thai (Thailand): th-TH

Languages marked with an asterisk (*) are not available for Native audio.

What's next

Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio. For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook, the Live API Tools cookbook, and the Live API Get Started script.
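As a closing quick reference, here is a minimal Python sketch that combines several of the configuration options covered above (voice, language, output transcription, and tuned automatic VAD) into one session config. The individual keys are taken from the examples in this guide, but combining them in a single config is this sketch's assumption; check the API reference for which combinations a given model accepts.

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"  # half-cascade model used throughout this guide

# Combines options shown separately above; this exact combination is an
# assumption, not a canonical configuration.
config = {
    "response_modalities": ["AUDIO"],  # one response modality per session
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}},
        "language_code": "de-DE",  # not supported by native audio models
    },
    "output_audio_transcription": {},  # transcribe the model's spoken replies
    "realtime_input_config": {
        "automatic_activity_detection": {
            "end_of_speech_sensitivity": types.EndSensitivity.END_SENSITIVITY_LOW,
            "silence_duration_ms": 100,
        }
    },
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        # Send audio with session.send_realtime_input and read replies from
        # session.receive(), as in the examples above.
        print("Session started")

if __name__ == "__main__":
    asyncio.run(main())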
Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_live-guide_aa05c114.txt b/text_content/docs_live-guide_aa05c114.txt new file mode 100644 index 0000000000000000000000000000000000000000..6502a70c01d8fe8b2d10eec50b009facbc0b72bd --- /dev/null +++ b/text_content/docs_live-guide_aa05c114.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#proactive-audio +Title: Live API capabilities guide  |  Gemini API  |  Google AI for Developers +================================================== + +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. 
Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. 
Sending and receiving audio

The most common audio example, audio-to-audio, is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format, and receives text output:

Python

# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
# Install helpers for converting files: pip install librosa soundfile
import asyncio
import io
from pathlib import Path
from google import genai
from google.genai import types
import soundfile as sf
import librosa

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        buffer = io.BytesIO()
        y, sr = librosa.load("sample.wav", sr=16000)
        sf.write(buffer, y, sr, format='RAW', subtype='PCM_16')
        buffer.seek(0)
        audio_bytes = buffer.read()

        # If already in correct format, you can use this:
        # audio_bytes = Path("sample.pcm").read_bytes()

        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )

        async for response in session.receive():
            if response.text is not None:
                print(response.text)

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

// Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
// Install helpers for converting files: npm install wavefile
import { GoogleGenAI, Modality } from '@google/genai';
import * as fs from "node:fs";
import pkg from 'wavefile';
const { WaveFile } = pkg;

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = { responseModalities: [Modality.TEXT] };

async function live() {
  const responseQueue = [];
  // waitMessage() and handleTurn() are identical to the "Sending and
  // receiving text" example above, so they are omitted here.

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  // Send Audio Chunk
  const fileBuffer = fs.readFileSync("sample.wav");

  // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono)
  const wav = new WaveFile();
  wav.fromBuffer(fileBuffer);
  wav.toSampleRate(16000);
  wav.toBitDepth("16");
  const base64Audio = wav.toBase64();

  // If already in correct format, you can use this:
  // const fileBuffer = fs.readFileSync("sample.pcm");
  // const base64Audio = Buffer.from(fileBuffer).toString('base64');

  session.sendRealtimeInput({
    audio: { data: base64Audio, mimeType: "audio/pcm;rate=16000" }
  });

  const turns = await handleTurn();
  for (const turn of turns) {
    if (turn.text) {
      console.debug('Received text: %s\n', turn.text);
    } else if (turn.data) {
      console.debug('Received inline data: %s\n', turn.data);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();
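In a live application you would typically stream microphone audio in small chunks rather than sending one blob, as the example above does. The hedged sketch below slices the same raw PCM bytes into roughly 50 ms chunks and sends them sequentially; the chunk size and pacing are illustrative assumptions, not documented requirements.

Python

# Hedged sketch: stream raw 16 kHz, 16-bit mono PCM to the session in small
# chunks, as a microphone capture loop would. Chunk size/pacing are assumed.
import asyncio
from google.genai import types

CHUNK_BYTES = 1600  # ~50 ms of 16-bit mono PCM at 16 kHz (16000 * 0.05 * 2)

async def stream_pcm(session, audio_bytes):
    for i in range(0, len(audio_bytes), CHUNK_BYTES):
        await session.send_realtime_input(
            audio=types.Blob(
                data=audio_bytes[i:i + CHUNK_BYTES],
                mime_type="audio/pcm;rate=16000",
            )
        )
        await asyncio.sleep(0.05)  # pace the sends at roughly real time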
And here is a text-to-audio example. You can receive audio by setting AUDIO as the response modality. This example saves the received data as a WAV file:

Python

import asyncio
import wave
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["AUDIO"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        wf = wave.open("audio.wav", "wb")
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)

        message = "Hello how are you?"
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": message}]}, turn_complete=True
        )

        async for response in session.receive():
            if response.data is not None:
                wf.writeframes(response.data)

            # Un-comment this code to print audio data info
            # if response.server_content.model_turn is not None:
            #     print(response.server_content.model_turn.parts[0].inline_data.mime_type)

        wf.close()

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

import { GoogleGenAI, Modality } from '@google/genai';
import * as fs from "node:fs";
import pkg from 'wavefile';
const { WaveFile } = pkg;

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = { responseModalities: [Modality.AUDIO] };

async function live() {
  const responseQueue = [];
  // waitMessage() and handleTurn() are identical to the "Sending and
  // receiving text" example above, so they are omitted here.

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  const inputTurns = 'Hello how are you?';
  session.sendClientContent({ turns: inputTurns });

  const turns = await handleTurn();

  // Combine audio data strings and save as a wave file
  const combinedAudio = turns.reduce((acc, turn) => {
    if (turn.data) {
      const buffer = Buffer.from(turn.data, 'base64');
      const intArray = new Int16Array(
        buffer.buffer,
        buffer.byteOffset,
        buffer.byteLength / Int16Array.BYTES_PER_ELEMENT
      );
      return acc.concat(Array.from(intArray));
    }
    return acc;
  }, []);

  const audioBuffer = new Int16Array(combinedAudio);
  const wf = new WaveFile();
  wf.fromScratch(1, 24000, '16', audioBuffer);
  fs.writeFileSync('output.wav', wf.toBuffer());

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

Audio formats

Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed, so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000.
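As a convenience, this minimal sketch (using the same librosa and soundfile helpers as the audio-to-text example above) converts an arbitrary audio file into the raw 16 kHz, 16-bit mono PCM bytes that the examples in this guide send; the input file name is a placeholder.

Python

# Hedged sketch: convert any audio file to raw little-endian 16-bit mono PCM
# at 16 kHz, ready to send with mime_type="audio/pcm;rate=16000".
# Assumes: pip install librosa soundfile
import io

import librosa
import soundfile as sf

def to_pcm16(path, rate=16000):
    # librosa resamples to `rate` and downmixes to mono by default
    y, sr = librosa.load(path, sr=rate, mono=True)
    buffer = io.BytesIO()
    # format='RAW' with subtype='PCM_16' writes headerless 16-bit samples
    sf.write(buffer, y, sr, format='RAW', subtype='PCM_16')
    return buffer.getvalue()

audio_bytes = to_pcm16("any-audio-file.mp3")  # placeholder file name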
Audio transcriptions

You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response.

Python

import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {
    "response_modalities": ["AUDIO"],
    "output_audio_transcription": {},
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        message = "Hello? Gemini are you there?"
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": message}]}, turn_complete=True
        )

        async for response in session.receive():
            if response.server_content.model_turn:
                print("Model turn:", response.server_content.model_turn)
            if response.server_content.output_transcription:
                print("Transcript:", response.server_content.output_transcription.text)

if __name__ == "__main__":
    asyncio.run(main())

JavaScript

import { GoogleGenAI, Modality } from '@google/genai';

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = {
  responseModalities: [Modality.AUDIO],
  outputAudioTranscription: {},
};

async function live() {
  const responseQueue = [];
  // waitMessage() and handleTurn() are identical to the "Sending and
  // receiving text" example above, so they are omitted here.

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  const inputTurns = 'Hello how are you?';
  session.sendClientContent({ turns: inputTurns });

  const turns = await handleTurn();
  for (const turn of turns) {
    if (turn.serverContent && turn.serverContent.outputTranscription) {
      console.debug('Received output transcription: %s\n',
          turn.serverContent.outputTranscription.text);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

You can enable transcription of the audio input by sending input_audio_transcription in the setup config.

Python

import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {
    "response_modalities": ["TEXT"],
    "input_audio_transcription": {},
}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        audio_data = Path("16000.pcm").read_bytes()

        await session.send_realtime_input(
            audio=types.Blob(data=audio_data, mime_type='audio/pcm;rate=16000')
        )

        async for msg in session.receive():
            if msg.server_content.input_transcription:
                print('Transcript:', msg.server_content.input_transcription.text)

if __name__ == "__main__":
    asyncio.run(main())
JavaScript

import { GoogleGenAI, Modality } from '@google/genai';
import * as fs from "node:fs";
import pkg from 'wavefile';
const { WaveFile } = pkg;

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = {
  responseModalities: [Modality.TEXT],
  inputAudioTranscription: {},
};

async function live() {
  const responseQueue = [];
  // waitMessage() and handleTurn() are identical to the "Sending and
  // receiving text" example above, so they are omitted here.

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  // Send Audio Chunk
  const fileBuffer = fs.readFileSync("16000.wav");

  // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono)
  const wav = new WaveFile();
  wav.fromBuffer(fileBuffer);
  wav.toSampleRate(16000);
  wav.toBitDepth("16");
  const base64Audio = wav.toBase64();

  // If already in correct format, you can use this:
  // const fileBuffer = fs.readFileSync("sample.pcm");
  // const base64Audio = Buffer.from(fileBuffer).toString('base64');

  session.sendRealtimeInput({
    audio: { data: base64Audio, mimeType: "audio/pcm;rate=16000" }
  });

  const turns = await handleTurn();
  for (const turn of turns) {
    if (turn.text) {
      console.debug('Received text: %s\n', turn.text);
    } else if (turn.data) {
      console.debug('Received inline data: %s\n', turn.data);
    } else if (turn.serverContent && turn.serverContent.inputTranscription) {
      console.debug('Received input transcription: %s\n',
          turn.serverContent.inputTranscription.text);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

Stream audio and video

To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab

Change voice and language

The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list). You can listen to all the voices in AI Studio.
To specify a voice, set the voice name within the speechConfig object as part of the session configuration:

Python

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}}
    },
}

JavaScript

const config = {
  responseModalities: [Modality.AUDIO],
  speechConfig: {
    voiceConfig: { prebuiltVoiceConfig: { voiceName: "Kore" } }
  }
};

Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices.

The Live API supports multiple languages. To change the language, set the language code within the speechConfig object as part of the session configuration:

Python

config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "language_code": "de-DE"
    },
}

JavaScript

const config = {
  responseModalities: [Modality.AUDIO],
  speechConfig: { languageCode: "de-DE" }
};

Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code.
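On half-cascade models, the two speechConfig examples above can plausibly be combined so a single session configuration pins both the voice and the language. The combined form below is an assumption based on those separate snippets rather than a documented sample; verify it against the API reference.

Python

# Hedged sketch: one speech_config carrying both a prebuilt voice and a
# language code. Assumes the two fields can be set together; language_code
# is not supported on native audio models.
config = {
    "response_modalities": ["AUDIO"],
    "speech_config": {
        "voice_config": {"prebuilt_voice_config": {"voice_name": "Kore"}},
        "language_code": "de-DE",
    },
}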
Native audio capabilities

The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation.

Note: Native audio models currently have limited tool use support. See Overview of supported tools for details.

How to use native audio output

To use native audio output, configure one of the native audio models and set response_modalities to AUDIO. See Send and receive audio for a full example.

Python

model = "gemini-2.5-flash-preview-native-audio-dialog"
config = types.LiveConnectConfig(response_modalities=["AUDIO"])

async with client.aio.live.connect(model=model, config=config) as session:
    # Send audio input and receive audio

JavaScript

const model = 'gemini-2.5-flash-preview-native-audio-dialog';
const config = { responseModalities: [Modality.AUDIO] };

async function main() {
  const session = await ai.live.connect({
    model: model,
    config: config,
    callbacks: ...,
  });
  // Send audio input and receive audio
  session.close();
}

main();

Affective dialog

This feature lets Gemini adapt its response style to the input expression and tone.

To use affective dialog, set the API version to v1alpha and set enable_affective_dialog to true in the setup message:

Python

client = genai.Client(http_options={"api_version": "v1alpha"})

config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    enable_affective_dialog=True,
)

JavaScript

const ai = new GoogleGenAI({ httpOptions: { "apiVersion": "v1alpha" } });

const config = {
  responseModalities: [Modality.AUDIO],
  enableAffectiveDialog: true,
};

Note that affective dialog is currently only supported by the native audio output models.

Proactive audio

When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant.

To use it, set the API version to v1alpha, configure the proactivity field in the setup message, and set proactive_audio to true:

Python

client = genai.Client(http_options={"api_version": "v1alpha"})

config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    proactivity={'proactive_audio': True},
)

JavaScript

const ai = new GoogleGenAI({ httpOptions: { "apiVersion": "v1alpha" } });

const config = {
  responseModalities: [Modality.AUDIO],
  proactivity: { proactiveAudio: true },
};

Note that proactive audio is currently only supported by the native audio output models.

Native audio output with thinking

Native audio output supports thinking capabilities, available via a separate model, gemini-2.5-flash-exp-native-audio-thinking-dialog. See Send and receive audio for a full example.

Python

model = "gemini-2.5-flash-exp-native-audio-thinking-dialog"
config = types.LiveConnectConfig(response_modalities=["AUDIO"])

async with client.aio.live.connect(model=model, config=config) as session:
    # Send audio input and receive audio

JavaScript

const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog';
const config = { responseModalities: [Modality.AUDIO] };

async function main() {
  const session = await ai.live.connect({
    model: model,
    config: config,
    callbacks: ...,
  });
  // Send audio input and receive audio
  session.close();
}

main();

Voice Activity Detection (VAD)

Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time.

When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server sends a BidiGenerateContentServerContent message to report the interruption. It also discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls.

Python

async for response in session.receive():
    if response.server_content.interrupted is True:
        # The generation was interrupted.
        # If realtime playback is implemented in your application,
        # you should stop playing audio and clear queued playback here.
        pass

JavaScript

const turns = await handleTurn();

for (const turn of turns) {
  if (turn.serverContent && turn.serverContent.interrupted) {
    // The generation was interrupted.
    // If realtime playback is implemented in your application,
    // you should stop playing audio and clear queued playback here.
  }
}

Automatic VAD

By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration.

When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time.

Python

# example audio file to try:
# URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm"
# !wget -q $URL -O sample.pcm
import asyncio
from pathlib import Path
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        audio_bytes = Path("sample.pcm").read_bytes()

        await session.send_realtime_input(
            audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
        )

        # if stream gets paused, send:
        # await session.send_realtime_input(audio_stream_end=True)

        async for response in session.receive():
            if response.text is not None:
                print(response.text)

if __name__ == "__main__":
    asyncio.run(main())
JavaScript

// example audio file to try:
// URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm"
// !wget -q $URL -O sample.pcm
import { GoogleGenAI, Modality } from '@google/genai';
import * as fs from "node:fs";

const ai = new GoogleGenAI({});
const model = 'gemini-live-2.5-flash-preview';
const config = { responseModalities: [Modality.TEXT] };

async function live() {
  const responseQueue = [];
  // waitMessage() and handleTurn() are identical to the "Sending and
  // receiving text" example above, so they are omitted here.

  const session = await ai.live.connect({
    model: model,
    callbacks: {
      onopen: function () { console.debug('Opened'); },
      onmessage: function (message) { responseQueue.push(message); },
      onerror: function (e) { console.debug('Error:', e.message); },
      onclose: function (e) { console.debug('Close:', e.reason); },
    },
    config: config,
  });

  // Send Audio Chunk
  const fileBuffer = fs.readFileSync("sample.pcm");
  const base64Audio = Buffer.from(fileBuffer).toString('base64');

  session.sendRealtimeInput({
    audio: { data: base64Audio, mimeType: "audio/pcm;rate=16000" }
  });

  // if stream gets paused, send:
  // session.sendRealtimeInput({ audioStreamEnd: true })

  const turns = await handleTurn();
  for (const turn of turns) {
    if (turn.text) {
      console.debug('Received text: %s\n', turn.text);
    } else if (turn.data) {
      console.debug('Received inline data: %s\n', turn.data);
    }
  }

  session.close();
}

async function main() {
  await live().catch((e) => console.error('got error', e));
}

main();

With send_realtime_input, the API responds to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering.

Automatic VAD configuration

For more control over the VAD activity, you can configure the following parameters. See the API reference for more info.

Python

from google.genai import types

config = {
    "response_modalities": ["TEXT"],
    "realtime_input_config": {
        "automatic_activity_detection": {
            "disabled": False,  # default
            "start_of_speech_sensitivity": types.StartSensitivity.START_SENSITIVITY_LOW,
            "end_of_speech_sensitivity": types.EndSensitivity.END_SENSITIVITY_LOW,
            "prefix_padding_ms": 20,
            "silence_duration_ms": 100,
        }
    }
}

JavaScript

import { GoogleGenAI, Modality, StartSensitivity, EndSensitivity } from '@google/genai';

const config = {
  responseModalities: [Modality.TEXT],
  realtimeInputConfig: {
    automaticActivityDetection: {
      disabled: false, // default
      startOfSpeechSensitivity: StartSensitivity.START_SENSITIVITY_LOW,
      endOfSpeechSensitivity: EndSensitivity.END_SENSITIVITY_LOW,
      prefixPaddingMs: 20,
      silenceDurationMs: 100,
    }
  }
};

Disable automatic VAD

Alternatively, automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message.
In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message.

Python

config = {
    "response_modalities": ["TEXT"],
    "realtime_input_config": {"automatic_activity_detection": {"disabled": True}},
}

async with client.aio.live.connect(model=model, config=config) as session:
    # ...
    await session.send_realtime_input(activity_start=types.ActivityStart())
    await session.send_realtime_input(
        audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
    )
    await session.send_realtime_input(activity_end=types.ActivityEnd())
    # ...

JavaScript

const config = {
  responseModalities: [Modality.TEXT],
  realtimeInputConfig: {
    automaticActivityDetection: {
      disabled: true,
    }
  }
};

session.sendRealtimeInput({ activityStart: {} })

session.sendRealtimeInput({
  audio: { data: base64Audio, mimeType: "audio/pcm;rate=16000" }
});

session.sendRealtimeInput({ activityEnd: {} })

Token count

You can find the total number of consumed tokens in the usageMetadata field of the returned server message.

Python

async for message in session.receive():
    # The server will periodically send messages that include UsageMetadata.
    if message.usage_metadata:
        usage = message.usage_metadata
        print(f"Used {usage.total_token_count} tokens in total. Response token breakdown:")
        for detail in usage.response_tokens_details:
            match detail:
                case types.ModalityTokenCount(modality=modality, token_count=count):
                    print(f"{modality}: {count}")

JavaScript

const turns = await handleTurn();

for (const turn of turns) {
  if (turn.usageMetadata) {
    console.debug('Used %s tokens in total. Response token breakdown:\n',
        turn.usageMetadata.totalTokenCount);
    for (const detail of turn.usageMetadata.responseTokensDetails) {
      console.debug('%s\n', detail);
    }
  }
}

Media resolution

You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration:

Python

from google.genai import types

config = {
    "response_modalities": ["AUDIO"],
    "media_resolution": types.MediaResolution.MEDIA_RESOLUTION_LOW,
}

JavaScript

import { GoogleGenAI, Modality, MediaResolution } from '@google/genai';

const config = {
  responseModalities: [Modality.TEXT],
  mediaResolution: MediaResolution.MEDIA_RESOLUTION_LOW,
};

Limitations

Consider the following limitations of the Live API when you plan your project.

Response modalities

You can only set one response modality (TEXT or AUDIO) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session.

Client authentication

The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach, you need to use ephemeral tokens to mitigate security risks.

Session duration

Audio-only sessions are limited to 15 minutes, and audio-plus-video sessions are limited to 2 minutes. However, you can use session management techniques to extend sessions beyond these limits; one possible configuration is sketched below.
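As a sketch of one such technique: the google-genai SDK exposes a context window compression option that, to my understanding, keeps a session alive past the usual limits by sliding-window-compressing older turns. Treat the exact field names below as assumptions and confirm them against the Session Management guide.

Python

# Hedged sketch: enable context window compression so the session is not
# terminated when it would otherwise hit duration/context limits.
# Field names assumed from the google-genai SDK; see the Session Management guide.
from google.genai import types

config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    # Sliding-window compression of older turns (assumed configuration)
    context_window_compression=types.ContextWindowCompressionConfig(
        sliding_window=types.SlidingWindow(),
    ),
)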
Context window A session has a context window limit of: 128k tokens for native audio output models 32k tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Language BCP-47 Code Language BCP-47 Code German (Germany) de-DE English (Australia)* en-AU English (UK)* en-GB English (India) en-IN English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Portuguese (Brazil) pt-BR Arabic (Generic) ar-XA Spanish (Spain)* es-ES French (Canada)* fr-CA Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Bengali (India) bn-IN Gujarati (India)* gu-IN Kannada (India)* kn-IN Marathi (India) mr-IN Malayalam (India)* ml-IN Tamil (India) ta-IN Telugu (India) te-IN Dutch (Netherlands) nl-NL Korean (South Korea) ko-KR Mandarin Chinese (China)* cmn-CN Polish (Poland) pl-PL Russian (Russia) ru-RU Thai (Thailand) th-TH Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_live-guide_ad162ad3.txt b/text_content/docs_live-guide_ad162ad3.txt new file mode 100644 index 0000000000000000000000000000000000000000..079ec86c52d6ccf157e567d641372acb1135db8d --- /dev/null +++ b/text_content/docs_live-guide_ad162ad3.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#supported-languages +Title: Live API capabilities guide  |  Gemini API  |  Google AI for Developers +================================================== + +Live API capabilities guide | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See Get started with Live API page for a overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods ( native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. 
To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . 
text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . 
shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . 
push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . 
turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . 
sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list ). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . 
close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. 
VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. 
Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . 
MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques to extend session duration without limit. Context window A session has a context window limit of 128k tokens for native audio output models and 32k tokens for other Live API models. Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code.

Language - BCP-47 code
German (Germany) - de-DE
English (Australia)* - en-AU
English (UK)* - en-GB
English (India) - en-IN
English (US) - en-US
Spanish (US) - es-US
French (France) - fr-FR
Hindi (India) - hi-IN
Portuguese (Brazil) - pt-BR
Arabic (Generic) - ar-XA
Spanish (Spain)* - es-ES
French (Canada)* - fr-CA
Indonesian (Indonesia) - id-ID
Italian (Italy) - it-IT
Japanese (Japan) - ja-JP
Turkish (Turkey) - tr-TR
Vietnamese (Vietnam) - vi-VN
Bengali (India) - bn-IN
Gujarati (India)* - gu-IN
Kannada (India)* - kn-IN
Marathi (India) - mr-IN
Malayalam (India)* - ml-IN
Tamil (India) - ta-IN
Telugu (India) - te-IN
Dutch (Netherlands) - nl-NL
Korean (South Korea) - ko-KR
Mandarin Chinese (China)* - cmn-CN
Polish (Poland) - pl-PL
Russian (Russia) - ru-RU
Thai (Thailand) - th-TH

Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script .
\ No newline at end of file diff --git a/text_content/docs_live-guide_c611966f.txt b/text_content/docs_live-guide_c611966f.txt new file mode 100644 index 0000000000000000000000000000000000000000..a10ace4af4700c8add93f4e1d5815ee8ba0bba6d --- /dev/null +++ b/text_content/docs_live-guide_c611966f.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-guide#affective-dialog +Title: Live API capabilities guide  |  Gemini API  |  Google AI for Developers +================================================== + +Live API capabilities guide Preview: The Live API is in preview. This is a comprehensive guide that covers capabilities and configurations available with the Live API. See the Get started with Live API page for an overview and sample code for common use cases. Before you begin Familiarize yourself with core concepts: If you haven't already done so, read the Get started with Live API page first. This will introduce you to the fundamental principles of the Live API, how it works, and the distinction between the different models and their corresponding audio generation methods (native audio or half-cascade). Try the Live API in AI Studio: You may find it useful to try the Live API in Google AI Studio before you start building. To use the Live API in Google AI Studio, select Stream . Establishing a connection The following example shows how to create a connection with an API key: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : print ( "Session started" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function main () { const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { console . debug ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send content... session . close (); } main (); Note: You can only set one modality in the response_modalities field. This means that you can configure the model to respond with either text or audio, but not both in the same session. Interaction modalities The following sections provide examples and supporting context for the different input and output modalities available in Live API. Sending and receiving text Here's how you can send and receive text: Python import asyncio from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello, how are you?" await session .
send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . text is not None : print ( response . text , end = "" ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Incremental content updates Use incremental updates to send text input, establish session context, or restore session context. For short contexts you can send turn-by-turn interactions to represent the exact sequence of events: Python turns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }]}, { "role" : "model" , "parts" : [{ "text" : "Paris" }]}, ] await session . send_client_content ( turns = turns , turn_complete = False ) turns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }]}] await session . send_client_content ( turns = turns , turn_complete = True ) JavaScript let inputTurns = [ { "role" : "user" , "parts" : [{ "text" : "What is the capital of France?" }] }, { "role" : "model" , "parts" : [{ "text" : "Paris" }] }, ] session . sendClientContent ({ turns : inputTurns , turnComplete : false }) inputTurns = [{ "role" : "user" , "parts" : [{ "text" : "What is the capital of Germany?" }] }] session . sendClientContent ({ turns : inputTurns , turnComplete : true }) For longer contexts it's recommended to provide a single message summary to free up the context window for subsequent interactions. See Session Resumption for another method for loading session context. Sending and receiving audio The most common audio example, audio-to-audio , is covered in the Getting started guide. 
Here's an audio-to-text example that reads a WAV file, sends it in the correct format and receives text output: Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav // Install helpers for converting files: npm install wavefile import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . 
error ( 'got error' , e )); } main (); And here is a text-to-audio example. You can receive audio by setting AUDIO as response modality. This example saves the received data as WAV file: Python import asyncio import wave from google import genai client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) message = "Hello how are you?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Un-comment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) = > { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); fs . writeFileSync ( 'output.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Audio formats Audio data in the Live API is always raw, little-endian, 16-bit PCM. Audio output always uses a sample rate of 24kHz. Input audio is natively 16kHz, but the Live API will resample if needed so any sample rate can be sent. To convey the sample rate of input audio, set the MIME type of each audio-containing Blob to a value like audio/pcm;rate=16000 . 
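For instance, a small helper can derive that MIME type from a WAV file's own header. This is a sketch using Python's standard wave module for 16-bit mono WAV input; the helper name is ours, not part of the SDK:

import wave
from google.genai import types

def wav_to_pcm_blob(path: str) -> types.Blob:
    """Wrap a 16-bit mono WAV file's raw PCM frames in a Blob, conveying
    the file's native sample rate in the MIME type."""
    with wave.open(path, "rb") as wf:
        assert wf.getsampwidth() == 2, "expected 16-bit samples"
        assert wf.getnchannels() == 1, "expected mono audio"
        rate = wf.getframerate()
        pcm = wf.readframes(wf.getnframes())
    return types.Blob(data=pcm, mime_type=f"audio/pcm;rate={rate}")

# Usage inside a session:
#   await session.send_realtime_input(audio=wav_to_pcm_blob("sample.wav"))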
Audio transcriptions You can enable transcription of the model's audio output by sending output_audio_transcription in the setup config. The transcription language is inferred from the model's response. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "AUDIO" ], "output_audio_transcription" : {} } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : message = "Hello? Gemini are you there?" await session . send_client_content ( turns = { "role" : "user" , "parts" : [{ "text" : message }]}, turn_complete = True ) async for response in session . receive (): if response . server_content . model_turn : print ( "Model turn:" , response . server_content . model_turn ) if response . server_content . output_transcription : print ( "Transcript:" , response . server_content . output_transcription . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . AUDIO ], outputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . debug ( 'Received output transcription: %s\n' , turn . serverContent . outputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); You can enable transcription of the audio input by sending input_audio_transcription in setup config. Python import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ], "input_audio_transcription" : {}, } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_data = Path ( "16000.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_data , mime_type = 'audio/pcm;rate=16000' ) ) async for msg in session . receive (): if msg . server_content . input_transcription : print ( 'Transcript:' , msg . server_content . input_transcription . text ) if __name__ == "__main__" : asyncio . 
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . TEXT ], inputAudioTranscription : {} }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "16000.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . outputTranscription ) { console . log ( "Transcription" ) console . log ( turn . serverContent . outputTranscription . text ); } } for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } else if ( turn . serverContent && turn . serverContent . inputTranscription ) { console . debug ( 'Received input transcription: %s\n' , turn . serverContent . inputTranscription . text ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Stream audio and video To see an example of how to use the Live API in a streaming audio and video format, run the "Live API - Get Started" file in the cookbooks repository: View on Colab Change voice and language The Live API models each support a different set of voices. Half-cascade supports Puck, Charon, Kore, Fenrir, Aoede, Leda, Orus, and Zephyr. Native audio supports a much longer list (identical to the TTS model list ). You can listen to all the voices in AI Studio . To specify a voice, set the voice name within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "voice_config" : { "prebuilt_voice_config" : { "voice_name" : "Kore" }} }, } JavaScript const config = { responseModalities : [ Modality . 
AUDIO ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" } } } }; Note: If you're using the generateContent API, the set of available voices is slightly different. See the audio generation guide for generateContent audio generation voices. The Live API supports multiple languages . To change the language, set the language code within the speechConfig object as part of the session configuration: Python config = { "response_modalities" : [ "AUDIO" ], "speech_config" : { "language_code" : "de-DE" } } JavaScript const config = { responseModalities : [ Modality . AUDIO ], speechConfig : { languageCode : "de-DE" } }; Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. Native audio capabilities The following capabilities are only available with native audio. You can learn more about native audio in Choose a model and audio generation . Note: Native audio models currently have limited tool use support. See Overview of supported tools for details. How to use native audio output To use native audio output, configure one of the native audio models and set response_modalities to AUDIO . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-preview-native-audio-dialog" config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-preview-native-audio-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Affective dialog This feature lets Gemini adapt its response style to the input expression and tone. To use affective dialog, set the api version to v1alpha and set enable_affective_dialog to true in the setup message: Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], enable_affective_dialog = True ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], enableAffectiveDialog : true }; Note that affective dialog is currently only supported by the native audio output models. Proactive audio When this feature is enabled, Gemini can proactively decide not to respond if the content is not relevant. To use it, set the api version to v1alpha and configure the proactivity field in the setup message and set proactive_audio to true : Python client = genai . Client ( http_options = { "api_version" : "v1alpha" }) config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], proactivity = { 'proactive_audio' : True } ) JavaScript const ai = new GoogleGenAI ({ httpOptions : { "apiVersion" : "v1alpha" } }); const config = { responseModalities : [ Modality . AUDIO ], proactivity : { proactiveAudio : true } } Note that proactive audio is currently only supported by the native audio output models. Native audio output with thinking Native audio output supports thinking capabilities , available via a separate model gemini-2.5-flash-exp-native-audio-thinking-dialog . See Send and receive audio for a full example. Python model = "gemini-2.5-flash-exp-native-audio-thinking-dialog" config = types . 
LiveConnectConfig ( response_modalities = [ "AUDIO" ]) async with client . aio . live . connect ( model = model , config = config ) as session : # Send audio input and receive audio JavaScript const model = 'gemini-2.5-flash-exp-native-audio-thinking-dialog' ; const config = { responseModalities : [ Modality . AUDIO ] }; async function main () { const session = await ai . live . connect ({ model : model , config : config , callbacks : ..., }); // Send audio input and receive audio session . close (); } main (); Voice Activity Detection (VAD) Voice Activity Detection (VAD) allows the model to recognize when a person is speaking. This is essential for creating natural conversations, as it allows a user to interrupt the model at any time. When VAD detects an interruption, the ongoing generation is canceled and discarded. Only the information already sent to the client is retained in the session history. The server then sends a BidiGenerateContentServerContent message to report the interruption. The Gemini server then discards any pending function calls and sends a BidiGenerateContentServerContent message with the IDs of the canceled calls. Python async for response in session . receive (): if response . server_content . interrupted is True : # The generation was interrupted # If realtime playback is implemented in your application, # you should stop playing audio and clear queued playback here. JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . interrupted ) { // The generation was interrupted // If realtime playback is implemented in your application, // you should stop playing audio and clear queued playback here. } } Automatic VAD By default, the model automatically performs VAD on a continuous audio input stream. VAD can be configured with the realtimeInputConfig.automaticActivityDetection field of the setup configuration . When the audio stream is paused for more than a second (for example, because the user switched off the microphone), an audioStreamEnd event should be sent to flush any cached audio. The client can resume sending audio data at any time. Python # example audio file to try: # URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" # !wget -q $URL -O sample.pcm import asyncio from pathlib import Path from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" config = { "response_modalities" : [ "TEXT" ]} async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : audio_bytes = Path ( "sample.pcm" ) . read_bytes () await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) # if stream gets paused, send: # await session.send_realtime_input(audio_stream_end=True) async for response in session . receive (): if response . text is not None : print ( response . text ) if __name__ == "__main__" : asyncio . run ( main ()) JavaScript // example audio file to try: // URL = "https://storage.googleapis.com/generativeai-downloads/data/hello_are_you_there.pcm" // !wget -q $URL -O sample.pcm import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const config = { responseModalities : [ Modality . 
TEXT ] }; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.pcm" ); const base64Audio = Buffer . from ( fileBuffer ). toString ( 'base64' ); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); // if stream gets paused, send: // session.sendRealtimeInput({ audioStreamEnd: true }) const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . text ) { console . debug ( 'Received text: %s\n' , turn . text ); } else if ( turn . data ) { console . debug ( 'Received inline data: %s\n' , turn . data ); } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); With send_realtime_input , the API will respond to audio automatically based on VAD. While send_client_content adds messages to the model context in order, send_realtime_input is optimized for responsiveness at the expense of deterministic ordering. Automatic VAD configuration For more control over the VAD activity, you can configure the following parameters. See API reference for more info. Python from google.genai import types config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : False , # default "start_of_speech_sensitivity" : types . StartSensitivity . START_SENSITIVITY_LOW , "end_of_speech_sensitivity" : types . EndSensitivity . END_SENSITIVITY_LOW , "prefix_padding_ms" : 20 , "silence_duration_ms" : 100 , } } } JavaScript import { GoogleGenAI , Modality , StartSensitivity , EndSensitivity } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : false , // default startOfSpeechSensitivity : StartSensitivity . START_SENSITIVITY_LOW , endOfSpeechSensitivity : EndSensitivity . END_SENSITIVITY_LOW , prefixPaddingMs : 20 , silenceDurationMs : 100 , } } }; Disable automatic VAD Alternatively, the automatic VAD can be disabled by setting realtimeInputConfig.automaticActivityDetection.disabled to true in the setup message. In this configuration the client is responsible for detecting user speech and sending activityStart and activityEnd messages at the appropriate times. An audioStreamEnd isn't sent in this configuration. Instead, any interruption of the stream is marked by an activityEnd message. Python config = { "response_modalities" : [ "TEXT" ], "realtime_input_config" : { "automatic_activity_detection" : { "disabled" : True }}, } async with client . aio . live . 
connect ( model = model , config = config ) as session : # ... await session . send_realtime_input ( activity_start = types . ActivityStart ()) await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) await session . send_realtime_input ( activity_end = types . ActivityEnd ()) # ... JavaScript const config = { responseModalities : [ Modality . TEXT ], realtimeInputConfig : { automaticActivityDetection : { disabled : true , } } }; session . sendRealtimeInput ({ activityStart : {} }) session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); session . sendRealtimeInput ({ activityEnd : {} }) Token count You can find the total number of consumed tokens in the usageMetadata field of the returned server message. Python async for message in session . receive (): # The server will periodically send messages that include UsageMetadata. if message . usage_metadata : usage = message . usage_metadata print ( f "Used { usage . total_token_count } tokens in total. Response token breakdown:" ) for detail in usage . response_tokens_details : match detail : case types . ModalityTokenCount ( modality = modality , token_count = count ): print ( f " { modality } : { count } " ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . usageMetadata ) { console . debug ( 'Used %s tokens in total. Response token breakdown:\n' , turn . usageMetadata . totalTokenCount ); for ( const detail of turn . usageMetadata . responseTokensDetails ) { console . debug ( '%s\n' , detail ); } } } Media resolution You can specify the media resolution for the input media by setting the mediaResolution field as part of the session configuration: Python from google.genai import types config = { "response_modalities" : [ "AUDIO" ], "media_resolution" : types . MediaResolution . MEDIA_RESOLUTION_LOW , } JavaScript import { GoogleGenAI , Modality , MediaResolution } from '@google/genai' ; const config = { responseModalities : [ Modality . TEXT ], mediaResolution : MediaResolution . MEDIA_RESOLUTION_LOW , }; Limitations Consider the following limitations of the Live API when you plan your project. Response modalities You can only set one response modality ( TEXT or AUDIO ) per session in the session configuration. Setting both results in a config error message. This means that you can configure the model to respond with either text or audio, but not both in the same session. Client authentication The Live API only provides server-to-server authentication by default. If you're implementing your Live API application using a client-to-server approach , you need to use ephemeral tokens to mitigate security risks. Session duration Audio-only sessions are limited to 15 minutes, and audio plus video sessions are limited to 2 minutes. However, you can configure different session management techniques for unlimited extensions on session duration. Context window A session has a context window limit of: 128k tokens for native audio output models 32k tokens for other Live API models Supported languages Live API supports the following languages. Note: Native audio output models automatically choose the appropriate language and don't support explicitly setting the language code. 
Language - BCP-47 code
German (Germany) - de-DE
English (Australia)* - en-AU
English (UK)* - en-GB
English (India) - en-IN
English (US) - en-US
Spanish (US) - es-US
French (France) - fr-FR
Hindi (India) - hi-IN
Portuguese (Brazil) - pt-BR
Arabic (Generic) - ar-XA
Spanish (Spain)* - es-ES
French (Canada)* - fr-CA
Indonesian (Indonesia) - id-ID
Italian (Italy) - it-IT
Japanese (Japan) - ja-JP
Turkish (Turkey) - tr-TR
Vietnamese (Vietnam) - vi-VN
Bengali (India) - bn-IN
Gujarati (India)* - gu-IN
Kannada (India)* - kn-IN
Marathi (India) - mr-IN
Malayalam (India)* - ml-IN
Tamil (India) - ta-IN
Telugu (India) - te-IN
Dutch (Netherlands) - nl-NL
Korean (South Korea) - ko-KR
Mandarin Chinese (China)* - cmn-CN
Polish (Poland) - pl-PL
Russian (Russia) - ru-RU
Thai (Thailand) - th-TH

Languages marked with an asterisk (*) are not available for Native audio . What's next Read the Tool Use and Session Management guides for essential information on using the Live API effectively. Try the Live API in Google AI Studio . For more info about the Live API models, see Gemini 2.0 Flash Live and Gemini 2.5 Flash Native Audio on the Models page. Try more examples in the Live API cookbook , the Live API Tools cookbook , and the Live API Get Started script . \ No newline at end of file diff --git a/text_content/docs_live-session_10ac610a.txt b/text_content/docs_live-session_10ac610a.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bb0a57793bc456ec749a6c2659403c3c9bfbe19 --- /dev/null +++ b/text_content/docs_live-session_10ac610a.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-session +Title: Session management with Live API  |  Gemini API  |  Google AI for Developers +================================================== + +Session management with Live API In the Live API, a session refers to a persistent connection where input and output are streamed continuously over the same connection (read more about how it works ). This session design enables low latency and supports unique features, but it can also introduce challenges, such as session time limits and early termination. This guide covers strategies for overcoming the session management challenges that can arise when using the Live API. Session lifetime Without compression, audio-only sessions are limited to 15 minutes, and audio-video sessions are limited to 2 minutes. Exceeding these limits will terminate the session (and therefore, the connection), but you can use context window compression to extend sessions indefinitely. The lifetime of a connection is limited as well, to around 10 minutes. When the connection terminates, the session terminates as well. In this case, you can configure a single session to stay active over multiple connections using session resumption .
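Both mechanisms are detailed below. As a preview of the pattern, a reconnect loop that keeps one logical session alive across connections might look like the following sketch; the loop structure and handle bookkeeping are illustrative, not prescribed by the API:

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"

async def main():
    handle = None  # None starts a new session; a stored handle resumes one
    while True:  # a real application would add an exit condition
        config = types.LiveConnectConfig(
            response_modalities=["TEXT"],
            session_resumption=types.SessionResumptionConfig(handle=handle),
        )
        try:
            async with client.aio.live.connect(model=model, config=config) as session:
                async for message in session.receive():
                    update = message.session_resumption_update
                    if update and update.resumable and update.new_handle:
                        # Retain the freshest handle so the next connection
                        # resumes the same session.
                        handle = update.new_handle
        except Exception:
            pass  # connection dropped (lifetime is ~10 minutes); loop and resume

if __name__ == "__main__":
    asyncio.run(main())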
You'll also receive a GoAway message before the connection ends, allowing you to take further actions. Context window compression To enable longer sessions, and avoid abrupt connection termination, you can enable context window compression by setting the contextWindowCompression field as part of the session configuration. In the ContextWindowCompressionConfig , you can configure a sliding-window mechanism and the number of tokens that triggers compression. Python from google.genai import types config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], context_window_compression = ( # Configures compression with default parameters. types . ContextWindowCompressionConfig ( sliding_window = types . SlidingWindow (), ) ), ) JavaScript const config = { responseModalities : [ Modality . AUDIO ], contextWindowCompression : { slidingWindow : {} } }; Session resumption To prevent session termination when the server periodically resets the WebSocket connection, configure the sessionResumption field within the setup configuration . Passing this configuration causes the server to send SessionResumptionUpdate messages, which can be used to resume the session by passing the last resumption token as the SessionResumptionConfig.handle of the subsequent connection. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" async def main (): print ( f "Connecting to the service with handle { previous_session_handle } ..." ) async with client . aio . live . connect ( model = model , config = types . LiveConnectConfig ( response_modalities = [ "AUDIO" ], session_resumption = types . SessionResumptionConfig ( # The handle of the session to resume is passed here, # or else None to start a new session. handle = previous_session_handle ), ), ) as session : while True : await session . send_client_content ( turns = types . Content ( role = "user" , parts = [ types . Part ( text = "Hello world!" )] ) ) async for message in session . receive (): # Periodically, the server will send update messages that may # contain a handle for the current state of the session. if message . session_resumption_update : update = message . session_resumption_update if update . resumable and update . new_handle : # The handle should be retained and linked to the session. return update . new_handle # For the purposes of this example, placeholder input is continually fed # to the model. In non-sample code, the model inputs would come from # the user. if message . server_content and message . server_content . turn_complete : break if __name__ == "__main__" : asyncio . run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } console . debug ( 'Connecting to the service with handle %s...' , previousSessionHandle ) const session = await ai . live . 
connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : { responseModalities : [ Modality . TEXT ], sessionResumption : { handle : previousSessionHandle } // The handle of the session to resume is passed here, or else null to start a new session. } }); const inputTurns = 'Hello how are you?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . sessionResumptionUpdate ) { if ( turn . sessionResumptionUpdate . resumable && turn . sessionResumptionUpdate . newHandle ) { let newHandle = turn . sessionResumptionUpdate . newHandle // ...Store newHandle and start new session with this handle here } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); Receiving a message before the session disconnects The server sends a GoAway message that signals that the current connection will soon be terminated. This message includes timeLeft , indicating the remaining connection time, and lets you take further action before the connection is terminated as ABORTED. Python async for response in session . receive (): if response . go_away is not None : # The connection will soon be terminated print ( response . go_away . time_left ) JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . goAway ) { console . debug ( 'Time left: %s\n' , turn . goAway . timeLeft ); } } Receiving a message when the generation is complete The server sends a generationComplete message that signals that the model finished generating the response. Python async for response in session . receive (): if response . server_content . generation_complete is True : # The generation is complete JavaScript const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . generationComplete ) { // The generation is complete } } What's next Explore more ways to work with the Live API in the full Capabilities guide, the Tool use page, or the Live API cookbook .
\ No newline at end of file diff --git a/text_content/docs_live-tools_93e13cac.txt b/text_content/docs_live-tools_93e13cac.txt new file mode 100644 index 0000000000000000000000000000000000000000..03d2c0c8478388d8089cd9598377cdec47c54b90 --- /dev/null +++ b/text_content/docs_live-tools_93e13cac.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live-tools#tools-overview +Title: Tool use with Live API  |  Gemini API  |  Google AI for Developers +================================================== + +Tool use with Live API Tool use allows the Live API to go beyond conversation by enabling it to perform actions in the real world and pull in external context while maintaining a real-time connection. You can define tools such as Function calling , Code execution , and Google Search with the Live API. Overview of supported tools Here's a brief overview of the available tools for each model:

Tool | Cascaded models (gemini-live-2.5-flash-preview, gemini-2.0-flash-live-001) | gemini-2.5-flash-preview-native-audio-dialog | gemini-2.5-flash-exp-native-audio-thinking-dialog
Search | Yes | Yes | Yes
Function calling | Yes | Yes | No
Code execution | Yes | No | No
URL context | Yes | No | No

Function calling Live API supports function calling, just like regular content generation requests. Function calling lets the Live API interact with external data and programs, greatly increasing what your applications can accomplish. You can define function declarations as part of the session configuration. After receiving tool calls, the client should respond with a list of FunctionResponse objects using the session.send_tool_response method. See the Function calling tutorial to learn more. Note: Unlike the generateContent API, the Live API doesn't support automatic tool response handling. You must handle tool responses manually in your client code. Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" # Simple function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" } turn_off_the_lights = { "name" : "turn_off_the_lights" } tools = [{ "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Turn on the lights please" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) elif chunk . tool_call : function_responses = [] for fc in chunk . tool_call . function_calls : function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" } # simple, hard-coded function response ) function_responses . append ( function_response ) await session . send_tool_response ( function_responses = function_responses ) if __name__ == "__main__" : asyncio .
run ( main ()) JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; // Simple function definitions const turn_on_the_lights = { name : "turn_on_the_lights" } // , description: '...', parameters: { ... } const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) = > setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Turn on the lights please' ; session . sendClientContent ({ turns : inputTurns }); let turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } else if ( turn . toolCall ) { const functionResponses = []; for ( const fc of turn . toolCall . functionCalls ) { functionResponses . push ({ id : fc . id , name : fc . name , response : { result : "ok" } // simple, hard-coded function response }); } console . debug ( 'Sending tool response...\n' ); session . sendToolResponse ({ functionResponses : functionResponses }); } } // Check again for new messages turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } } } } session . close (); } async function main () { await live (). catch (( e ) = > console . error ( 'got error' , e )); } main (); From a single prompt, the model can generate multiple function calls and the code necessary to chain their outputs. This code executes in a sandbox environment, generating subsequent BidiGenerateContentToolCall messages. Asynchronous function calling Note: Asynchronous function calling is only supported in half-cascade audio generation. Function calling executes sequentially by default, meaning execution pauses until the results of each function call are available. This ensures sequential processing, which means you won't be able to continue interacting with the model while the functions are being run. If you don't want to block the conversation, you can tell the model to run the functions asynchronously. 
Asynchronous function calling

Note: Asynchronous function calling is only supported in half-cascade audio generation.

Function calling executes sequentially by default: execution pauses until the result of each function call is available, so you can't continue interacting with the model while the functions are being run. If you don't want to block the conversation, you can tell the model to run the functions asynchronously. To do so, you first need to add a behavior to the function definitions:

Python # Non-blocking function definitions turn_on_the_lights = { "name" : "turn_on_the_lights" , "behavior" : "NON_BLOCKING" } # turn_on_the_lights will run asynchronously turn_off_the_lights = { "name" : "turn_off_the_lights" } # turn_off_the_lights will still pause all interactions with the model

JavaScript import { GoogleGenAI , Modality , Behavior } from '@google/genai' ; // Non-blocking function definitions const turn_on_the_lights = { name : "turn_on_the_lights" , behavior : Behavior . NON_BLOCKING } // Blocking function definitions const turn_off_the_lights = { name : "turn_off_the_lights" } const tools = [{ functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] }]

NON_BLOCKING ensures the function runs asynchronously while you can continue interacting with the model.

Then you need to tell the model how to behave when it receives the FunctionResponse, using the scheduling parameter. It can either:

- Interrupt what it's doing and tell you about the response it got right away ( scheduling="INTERRUPT" ),
- Wait until it's finished with what it's currently doing ( scheduling="WHEN_IDLE" ),
- Or do nothing and use that knowledge later on in the discussion ( scheduling="SILENT" )

Python # for a non-blocking function definition, apply scheduling in the function response: function_response = types . FunctionResponse ( id = fc . id , name = fc . name , response = { "result" : "ok" , "scheduling" : "INTERRUPT" # Can also be WHEN_IDLE or SILENT } )

JavaScript import { GoogleGenAI , Modality , Behavior , FunctionResponseScheduling } from '@google/genai' ; // for a non-blocking function definition, apply scheduling in the function response: const functionResponse = { id : fc . id , name : fc . name , response : { result : "ok" , scheduling : FunctionResponseScheduling . INTERRUPT // Can also be WHEN_IDLE or SILENT } }

Code execution

You can define code execution as part of the session configuration. This lets the Live API generate and execute Python code and dynamically perform computations to benefit your results. See the Code execution tutorial to learn more.

Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" tools = [{ 'code_execution' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "Compute the largest prime palindrome under 100000." await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ())

JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ codeExecution : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift ();
if ( message ) { done = true ; } else { await new Promise (( resolve ) => setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'Compute the largest prime palindrome under 100000.' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) => console . error ( 'got error' , e )); } main ();

Grounding with Google Search

You can enable Grounding with Google Search as part of the session configuration. This increases the Live API's accuracy and helps prevent hallucinations. See the Grounding tutorial to learn more.

Python import asyncio from google import genai from google.genai import types client = genai . Client () model = "gemini-live-2.5-flash-preview" tools = [{ 'google_search' : {}}] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : prompt = "When did the last Brazil vs. Argentina soccer match happen?" await session . send_client_content ( turns = { "parts" : [{ "text" : prompt }]}) async for chunk in session . receive (): if chunk . server_content : if chunk . text is not None : print ( chunk . text ) # The model might generate and execute Python code to use Search model_turn = chunk . server_content . model_turn if model_turn : for part in model_turn . parts : if part . executable_code is not None : print ( part . executable_code . code ) if part . code_execution_result is not None : print ( part . code_execution_result . output ) if __name__ == "__main__" : asyncio . run ( main ())

JavaScript import { GoogleGenAI , Modality } from '@google/genai' ; const ai = new GoogleGenAI ({}); const model = 'gemini-live-2.5-flash-preview' ; const tools = [{ googleSearch : {}}] const config = { responseModalities : [ Modality . TEXT ], tools : tools } async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) => setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) {
const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } else if ( message . toolCall ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); const inputTurns = 'When did the last Brazil vs. Argentina soccer match happen?' ; session . sendClientContent ({ turns : inputTurns }); const turns = await handleTurn (); for ( const turn of turns ) { if ( turn . serverContent && turn . serverContent . modelTurn && turn . serverContent . modelTurn . parts ) { for ( const part of turn . serverContent . modelTurn . parts ) { if ( part . text ) { console . debug ( 'Received text: %s\n' , part . text ); } else if ( part . executableCode ) { console . debug ( 'executableCode: %s\n' , part . executableCode . code ); } else if ( part . codeExecutionResult ) { console . debug ( 'codeExecutionResult: %s\n' , part . codeExecutionResult . output ); } } } } session . close (); } async function main () { await live (). catch (( e ) => console . error ( 'got error' , e )); } main ();

Combining multiple tools

You can combine multiple tools within the Live API, increasing your application's capabilities even more:

Python prompt = """ Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. 3. Turn on the lights Thanks! """ tools = [ { "google_search" : {}}, { "code_execution" : {}}, { "function_declarations" : [ turn_on_the_lights , turn_off_the_lights ]}, ] config = { "response_modalities" : [ "TEXT" ], "tools" : tools } # ... remaining model call

JavaScript const prompt = `Hey, I need you to do three things for me. 1. Compute the largest prime palindrome under 100000. 2. Then use Google Search to look up information about the largest earthquake in California the week of Dec 5 2024. 3. Turn on the lights Thanks! ` const tools = [ { googleSearch : {} }, { codeExecution : {} }, { functionDeclarations : [ turn_on_the_lights , turn_off_the_lights ] } ] const config = { responseModalities : [ Modality . TEXT ], tools : tools } // ... remaining model call

A completed version of this call is sketched at the end of this page.

What's next

Check out more examples of using tools with the Live API in the Tool use cookbook. Get the full story on features and configurations from the Live API Capabilities guide.
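For reference, here is one way the "remaining model call" elided in the combining-multiple-tools snippets could be completed. This is a hedged sketch, not the official sample: it assumes the prompt, tools, and config variables defined above and reuses the same hard-coded tool responses and receive loop as the earlier examples on this page:

import asyncio
from google import genai
from google.genai import types

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
# prompt, tools, and config as defined in the combining-multiple-tools example above.

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(turns={"parts": [{"text": prompt}]})
        async for chunk in session.receive():
            if chunk.server_content:
                if chunk.text is not None:
                    print(chunk.text)
                # Print any code the model wrote and its execution result.
                model_turn = chunk.server_content.model_turn
                if model_turn:
                    for part in model_turn.parts:
                        if part.executable_code is not None:
                            print(part.executable_code.code)
                        if part.code_execution_result is not None:
                            print(part.code_execution_result.output)
            elif chunk.tool_call:
                # Hard-coded function responses, as in the function calling example.
                function_responses = [
                    types.FunctionResponse(id=fc.id, name=fc.name, response={"result": "ok"})
                    for fc in chunk.tool_call.function_calls
                ]
                await session.send_tool_response(function_responses=function_responses)

if __name__ == "__main__":
    asyncio.run(main())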
\ No newline at end of file
diff --git a/text_content/docs_live_8a8e6385.txt b/text_content/docs_live_8a8e6385.txt new file mode 100644 index 0000000000000000000000000000000000000000..3714c08326319ca22397484f0b4a01cf918e2364 --- /dev/null +++ b/text_content/docs_live_8a8e6385.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/live#audio-generation +Title: Get started with Live API  |  Gemini API  |  Google AI for Developers +==================================================

Get started with Live API

Preview: The Live API is in preview.

The Live API enables low-latency, real-time voice and video interactions with Gemini. It processes continuous streams of audio, video, or text to deliver immediate, human-like spoken responses, creating a natural conversational experience for your users. Live API offers a comprehensive set of features such as Voice Activity Detection, tool use and function calling, session management (for managing long-running conversations), and ephemeral tokens (for secure client-side authentication). This page gets you up and running with examples and basic code samples.

Example applications

Check out the following example applications that illustrate how to use Live API for end-to-end use cases:

- Live audio starter app on AI Studio, using JavaScript libraries to connect to Live API and stream bidirectional audio through your microphone and speakers.
- Live API Python cookbook using PyAudio that connects to the Live API.

Partner integrations

If you prefer a simpler development process, you can use Daily or LiveKit. These are third-party partner platforms that have already integrated the Gemini Live API over the WebRTC protocol to streamline the development of real-time audio and video applications.

Before you begin building

There are two important decisions to make before you begin building with the Live API: choosing a model and choosing an implementation approach.

Choose an audio generation architecture

If you're building an audio-based use case, your choice of model determines the audio generation architecture used to create the audio response:

- Native audio: This option provides the most natural and realistic-sounding speech and better multilingual performance. It also enables advanced features like affective (emotion-aware) dialogue, proactive audio (where the model can decide to ignore or respond to certain inputs), and "thinking". Native audio is supported by the following native audio models: gemini-2.5-flash-preview-native-audio-dialog, gemini-2.5-flash-exp-native-audio-thinking-dialog
- Half-cascade audio: This option uses a cascaded model architecture (native audio input and text-to-speech output). It offers better performance and reliability in production environments, especially with tool use. Half-cascade audio is supported by the following models: gemini-live-2.5-flash-preview, gemini-2.0-flash-live-001

Choose an implementation approach

When integrating with Live API, you'll need to choose one of the following implementation approaches:

- Server-to-server: Your backend connects to the Live API using WebSockets. Typically, your client sends stream data (audio, video, text) to your server, which then forwards it to the Live API.
- Client-to-server: Your frontend code connects directly to the Live API using WebSockets to stream data, bypassing your backend.

Note: Client-to-server generally offers better performance for streaming audio and video, since it bypasses the need to send the stream to your backend first. It's also easier to set up, since you don't need to implement a proxy that sends data from your client to your server and then from your server to the API. However, for production environments, we recommend using ephemeral tokens instead of standard API keys to mitigate security risks.

Get started

This example reads a WAV file, sends it in the correct format, and saves the received data as a WAV file. You can send audio by convertinging it to 16-bit PCM, 16kHz, mono format, and you can receive audio by setting AUDIO as the response modality. The output uses a sample rate of 24kHz.

Python # Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav # Install helpers for converting files: pip install librosa soundfile import asyncio import io from pathlib import Path import wave from google import genai from google.genai import types import soundfile as sf import librosa client = genai . Client () # Half cascade model: # model = "gemini-live-2.5-flash-preview" # Native audio output model: model = "gemini-2.5-flash-preview-native-audio-dialog" config = { "response_modalities" : [ "AUDIO" ], "system_instruction" : "You are a helpful assistant and answer in a friendly tone." , } async def main (): async with client . aio . live . connect ( model = model , config = config ) as session : buffer = io . BytesIO () y , sr = librosa . load ( "sample.wav" , sr = 16000 ) sf . write ( buffer , y , sr , format = 'RAW' , subtype = 'PCM_16' ) buffer . seek ( 0 ) audio_bytes = buffer . read () # If already in correct format, you can use this: # audio_bytes = Path("sample.pcm").read_bytes() await session . send_realtime_input ( audio = types . Blob ( data = audio_bytes , mime_type = "audio/pcm;rate=16000" ) ) wf = wave . open ( "audio.wav" , "wb" ) wf . setnchannels ( 1 ) wf . setsampwidth ( 2 ) wf . setframerate ( 24000 ) # Output is 24kHz async for response in session . receive (): if response . data is not None : wf . writeframes ( response . data ) # Uncomment this code to print audio data info # if response.server_content.model_turn is not None: # print(response.server_content.model_turn.parts[0].inline_data.mime_type) wf . close () if __name__ == "__main__" : asyncio . run ( main ())

JavaScript // Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav import { GoogleGenAI , Modality } from '@google/genai' ; import * as fs from "node:fs" ; import pkg from 'wavefile' ; // npm install wavefile const { WaveFile } = pkg ; const ai = new GoogleGenAI ({}); // WARNING: Do not use API keys in client-side (browser based) applications // Consider using Ephemeral Tokens instead // More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens // Half cascade model: // const model = "gemini-live-2.5-flash-preview" // Native audio output model: const model = "gemini-2.5-flash-preview-native-audio-dialog" const config = { responseModalities : [ Modality . AUDIO ], systemInstruction : "You are a helpful assistant and answer in a friendly tone." };
async function live () { const responseQueue = []; async function waitMessage () { let done = false ; let message = undefined ; while ( ! done ) { message = responseQueue . shift (); if ( message ) { done = true ; } else { await new Promise (( resolve ) => setTimeout ( resolve , 100 )); } } return message ; } async function handleTurn () { const turns = []; let done = false ; while ( ! done ) { const message = await waitMessage (); turns . push ( message ); if ( message . serverContent && message . serverContent . turnComplete ) { done = true ; } } return turns ; } const session = await ai . live . connect ({ model : model , callbacks : { onopen : function () { console . debug ( 'Opened' ); }, onmessage : function ( message ) { responseQueue . push ( message ); }, onerror : function ( e ) { console . debug ( 'Error:' , e . message ); }, onclose : function ( e ) { console . debug ( 'Close:' , e . reason ); }, }, config : config , }); // Send Audio Chunk const fileBuffer = fs . readFileSync ( "sample.wav" ); // Ensure audio conforms to API requirements (16-bit PCM, 16kHz, mono) const wav = new WaveFile (); wav . fromBuffer ( fileBuffer ); wav . toSampleRate ( 16000 ); wav . toBitDepth ( "16" ); const base64Audio = wav . toBase64 (); // If already in correct format, you can use this: // const fileBuffer = fs.readFileSync("sample.pcm"); // const base64Audio = Buffer.from(fileBuffer).toString('base64'); session . sendRealtimeInput ( { audio : { data : base64Audio , mimeType : "audio/pcm;rate=16000" } } ); const turns = await handleTurn (); // Combine audio data strings and save as wave file const combinedAudio = turns . reduce (( acc , turn ) => { if ( turn . data ) { const buffer = Buffer . from ( turn . data , 'base64' ); const intArray = new Int16Array ( buffer . buffer , buffer . byteOffset , buffer . byteLength / Int16Array . BYTES_PER_ELEMENT ); return acc . concat ( Array . from ( intArray )); } return acc ; }, []); const audioBuffer = new Int16Array ( combinedAudio ); const wf = new WaveFile (); wf . fromScratch ( 1 , 24000 , '16' , audioBuffer ); // output is 24kHz fs . writeFileSync ( 'audio.wav' , wf . toBuffer ()); session . close (); } async function main () { await live (). catch (( e ) => console . error ( 'got error' , e )); } main ();

What's next

- Read the full Live API Capabilities guide for key capabilities and configurations, including Voice Activity Detection and native audio features.
- Read the Tool use guide to learn how to integrate Live API with tools and function calling.
- Read the Session management guide for managing long-running conversations.
- Read the Ephemeral tokens guide for secure authentication in client-to-server applications.
- For more information about the underlying WebSockets API, see the WebSockets API reference.
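The audio example above is the core workflow; for quick experiments it can be handy to start with text in and text out before wiring up audio conversion. A minimal hedged sketch, using the same connect/send/receive pattern shown in the tool-use examples earlier in this document (prompt text is illustrative):

import asyncio
from google import genai

client = genai.Client()
model = "gemini-live-2.5-flash-preview"
config = {"response_modalities": ["TEXT"]}

async def main():
    async with client.aio.live.connect(model=model, config=config) as session:
        await session.send_client_content(
            turns={"parts": [{"text": "Hello! Tell me a one-line joke."}]}
        )
        # Stream the model's text back as it arrives.
        async for chunk in session.receive():
            if chunk.text is not None:
                print(chunk.text, end="")

if __name__ == "__main__":
    asyncio.run(main())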
\ No newline at end of file
diff --git a/text_content/docs_long-context_3887bd03.txt b/text_content/docs_long-context_3887bd03.txt new file mode 100644 index 0000000000000000000000000000000000000000..50bc5f347bfaf47579814e7fa6ac707e8efdeec9 --- /dev/null +++ b/text_content/docs_long-context_3887bd03.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/long-context#main-content +Title: Long context  |  Gemini API  |  Google AI for Developers +==================================================

Long context

Many Gemini models come with large context windows of 1 million or more tokens. Historically, large language models (LLMs) were significantly limited by the amount of text (or tokens) that could be passed to the model at one time. The Gemini long context window unlocks many new use cases and developer paradigms. The code you already use for cases like text generation or multimodal inputs will work without any changes with long context.

This document gives you an overview of what you can achieve using models with context windows of 1M or more tokens. It gives a brief overview of the context window, explores how developers should think about long context, covers various real-world use cases, and describes ways to optimize the usage of long context. For the context window sizes of specific models, see the Models page.

What is a context window?

The basic way you use the Gemini models is by passing information (context) to the model, which will subsequently generate a response. An analogy for the context window is short-term memory. There is a limited amount of information that can be stored in someone's short-term memory, and the same is true for generative models. You can read more about how models work under the hood in our generative models guide.

Getting started with long context

Earlier versions of generative models were only able to process 8,000 tokens at a time. Newer models pushed this further by accepting 32,000 or even 128,000 tokens. Gemini is the first model capable of accepting 1 million tokens. In practice, 1 million tokens would look like:

- 50,000 lines of code (with the standard 80 characters per line)
- All the text messages you have sent in the last 5 years
- 8 average-length English novels
- Transcripts of over 200 average-length podcast episodes

The more limited context windows common in many other models often require strategies like arbitrarily dropping old messages, summarizing content, using RAG with vector databases, or filtering prompts to save tokens. While these techniques remain valuable in specific scenarios, Gemini's extensive context window invites a more direct approach: providing all relevant information upfront. Because Gemini models were purpose-built with massive context capabilities, they demonstrate powerful in-context learning. For example, using only in-context instructional materials (a 500-page reference grammar, a dictionary, and ≈400 parallel sentences), Gemini learned to translate from English to Kalamang—a Papuan language with fewer than 200 speakers—with quality similar to a human learner using the same materials.
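In practice, the "provide everything upfront" workflow is just an ordinary generate_content call with a large contents payload. A minimal hedged sketch (the file name and question are illustrative, and the chosen model must support a long context window):

from google import genai

client = genai.Client()

# Illustrative: load a large text corpus directly into the prompt.
with open("reference_grammar.txt") as f:
    corpus = f.read()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    # Long context first, question last (see the FAQ later in this page).
    contents=[corpus, "Using the grammar above, how is the past tense formed?"],
)
print(response.text)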
The Kalamang example illustrates the paradigm shift enabled by Gemini's long context, empowering new possibilities through robust in-context learning.

Long context use cases

While the standard use case for most generative models is still text input, the Gemini model family enables a new paradigm of multimodal use cases. These models can natively understand text, video, audio, and images. They are complemented by the Gemini API, which accepts multimodal file types for convenience.

Long form text

Text has proved to be the layer of intelligence underpinning much of the momentum around LLMs. As mentioned earlier, much of the practical limitation of LLMs came from not having a large enough context window to do certain tasks. This led to the rapid adoption of retrieval augmented generation (RAG) and other techniques which dynamically provide the model with relevant contextual information. Now, with larger and larger context windows, new techniques are becoming available which unlock new use cases. Some emerging and standard use cases for text-based long context include:

- Summarizing large corpora of text: previous summarization options with smaller-context models would require a sliding window or another technique to keep state of previous sections as new tokens are passed to the model.
- Question answering: historically this was only possible with RAG, given the limited amount of context and models' low factual recall.
- Agentic workflows: text is the underpinning of how agents keep state of what they have done and what they need to do; not having enough information about the world and the agent's goal limits the reliability of agents.

Many-shot in-context learning is one of the most unique capabilities unlocked by long context models. Research has shown that taking the common "single shot" or "multi-shot" example paradigm, where the model is presented with one or a few examples of a task, and scaling that up to hundreds, thousands, or even hundreds of thousands of examples, can lead to novel model capabilities. This many-shot approach has also been shown to perform similarly to models which were fine-tuned for a specific task. For use cases where a Gemini model's performance is not yet sufficient for a production rollout, you can try the many-shot approach. As discussed later in the long context optimization section, context caching makes this type of high-input-token workload much more economically feasible, and even lower latency in some cases.

Long form video

Video content's utility has long been constrained by the limited accessibility of the medium itself. It was hard to skim the content, transcripts often failed to capture the nuance of a video, and most tools don't process image, text, and audio together. With Gemini, the long-context text capabilities translate to the ability to reason and answer questions about multimodal inputs with sustained performance. Some emerging and standard use cases for video long context include:

- Video question and answering
- Video memory, as shown with Google's Project Astra
- Video captioning
- Video recommendation systems, by enriching existing metadata with new multimodal understanding
- Video customization, by looking at a corpus of data and associated video metadata and then removing parts of videos that are not relevant to the viewer
- Video content moderation
- Real-time video processing

When working with videos, it is important to consider how the videos are processed into tokens, which affects billing and usage limits.
You can learn more about prompting with video files in the Prompting guide.

Long form audio

The Gemini models were the first natively multimodal large language models that could understand audio. Historically, the typical developer workflow would involve stringing together multiple domain-specific models, like a speech-to-text model and a text-to-text model, in order to process audio. This added latency from the multiple round-trip requests and hurt performance, usually attributed to the disconnected architectures of the multi-model setup. Some emerging and standard use cases for audio context include:

- Real-time transcription and translation
- Podcast / video question and answering
- Meeting transcription and summarization
- Voice assistants

You can learn more about prompting with audio files in the Prompting guide.

Long context optimizations

The primary optimization when working with long context and the Gemini models is to use context caching. Beyond the previous impossibility of processing lots of tokens in a single request, the other main constraint was the cost. If you have a "chat with your data" app where a user uploads 10 PDFs, a video, and some work documents, you would historically have to work with a more complex retrieval augmented generation (RAG) tool / framework in order to process these requests and pay a significant amount for tokens moved into the context window. Now, you can cache the files the user uploads and pay to store them on a per-hour basis. The input / output cost per request with Gemini Flash, for example, is ~4x less than the standard input / output cost, so if the user chats with their data enough, it becomes a huge cost saving for you as the developer.

Long context limitations

In various sections of this guide, we talked about how Gemini models achieve high performance across various needle-in-a-haystack retrieval evals. These tests consider the most basic setup, where you have a single needle you are looking for. In cases where you might have multiple "needles" or specific pieces of information you are looking for, the model does not perform with the same accuracy, and performance can vary to a wide degree depending on the context. This is important to consider, as there is an inherent tradeoff between getting the right information retrieved and cost. You can get ~99% accuracy on a single query, but you have to pay the input token cost every time you send that query. So if you needed 99% performance when retrieving 100 pieces of information, you would likely need to send 100 requests. This is a good example of where context caching can significantly reduce the cost associated with using Gemini models while keeping the performance high.

FAQs

Where is the best place to put my query in the context window?
In most cases, especially if the total context is long, the model's performance will be better if you put your query / question at the end of the prompt (after all the other context).

Do I lose model performance when I add more tokens to a query?
Generally, if you don't need tokens to be passed to the model, it is best to avoid passing them. However, if you have a large chunk of tokens with some information and want to ask questions about that information, the model is highly capable of extracting that information (up to 99% accuracy in many cases).

How can I lower my cost with long-context queries?
If you have a similar set of tokens / context that you want to re-use many times, context caching can help reduce the costs associated with asking questions about that information.

Does the context length affect the model latency?
There is some fixed amount of latency in any given request, regardless of the size, but generally longer queries will have higher latency (time to first token).
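As a concrete illustration of that FAQ answer, here is a hedged sketch of context caching with the google-genai SDK. The file name, TTL, and prompts are illustrative, and explicit caching generally requires a specific model version; check the caching documentation for the current field names and supported models:

from google import genai
from google.genai import types

client = genai.Client()

# Upload the large document once, then cache it alongside a system instruction.
big_doc = client.files.upload(file="path/to/large_report.pdf")
cache = client.caches.create(
    model="gemini-2.0-flash-001",  # caching typically needs an explicit model version
    config=types.CreateCachedContentConfig(
        contents=[big_doc],
        system_instruction="Answer questions using the attached report.",
        ttl="3600s",  # storage is billed per hour, as described above
    ),
)

# Each follow-up question reuses the cached tokens instead of resending them.
response = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="What are the key findings?",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)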
diff --git a/text_content/docs_migrate-to-cloud_c3e6093b.txt b/text_content/docs_migrate-to-cloud_c3e6093b.txt new file mode 100644 index 0000000000000000000000000000000000000000..57649b8130da0c01f48f0967c2e8bea301b8f5bb --- /dev/null +++ b/text_content/docs_migrate-to-cloud_c3e6093b.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate-to-cloud#main-content +Title: Gemini Developer API vs. Vertex AI  |  Gemini API  |  Google AI for Developers +==================================================

Gemini Developer API vs. Vertex AI

When developing generative AI solutions with Gemini, Google offers two API products: the Gemini Developer API and the Vertex AI Gemini API. The Gemini Developer API provides the fastest path to build, productionize, and scale Gemini-powered applications. Most developers should use the Gemini Developer API unless they need specific enterprise controls. Vertex AI offers a comprehensive ecosystem of enterprise-ready features and services for building and deploying generative AI applications, backed by the Google Cloud Platform. We've recently simplified migrating between these services.
Both the Gemini Developer API and the Vertex AI Gemini API are now accessible through the unified Google Gen AI SDK.

Code comparison

This page has side-by-side code comparisons between the Gemini Developer API and Vertex AI quickstarts for text generation.

Python

You can access both the Gemini Developer API and Vertex AI services through the google-genai library. See the libraries page for instructions on how to install google-genai.

Gemini Developer API from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text )

Vertex AI Gemini API from google import genai client = genai . Client ( vertexai = True , project = 'your-project-id' , location = 'us-central1' ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = "Explain how AI works in a few words" ) print ( response . text )

JavaScript and TypeScript

You can access both the Gemini Developer API and Vertex AI services through the @google/genai library. See the libraries page for instructions on how to install @google/genai.

Gemini Developer API import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main ();

Vertex AI Gemini API import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ vertexai : true , project : 'your_project' , location : 'your_location' , }); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Explain how AI works in a few words" , }); console . log ( response . text ); } main ();

Go

You can access both the Gemini Developer API and Vertex AI services through the google.golang.org/genai library. See the libraries page for instructions on how to install google.golang.org/genai.

Gemini Developer API import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () // The client reads the API key from the GEMINI_API_KEY environment variable. client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) }

Vertex AI Gemini API import ( "context" "fmt" "log" "google.golang.org/genai" ) // Your GCP project const project = "your-project" // A GCP location like "us-central1" const location = "some-gcp-location" func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , & genai . ClientConfig { Project : project , Location : location , Backend : genai . BackendVertexAI , }) if err != nil { log . Fatal ( err ) } // Call the GenerateContent method. result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York?" ), nil ) if err != nil { log . Fatal ( err ) } fmt . Println ( result . Text ()) }

Other use cases and platforms

Refer to use case specific guides on the Gemini Developer API documentation and Vertex AI documentation for other platforms and use cases.

Migration considerations

When you migrate:

- You'll need to use Google Cloud service accounts to authenticate. See the Vertex AI documentation for more information.
- You can use your existing Google Cloud project (the same one you used to generate your API key) or you can create a new Google Cloud project.
- Supported regions may differ between the Gemini Developer API and the Vertex AI Gemini API. See the list of supported regions for generative AI on Google Cloud.
- Any models you created in Google AI Studio need to be retrained in Vertex AI.
- If you no longer need to use your Gemini API key for the Gemini Developer API, follow security best practices and delete it.

To delete an API key:

1. Open the Google Cloud API Credentials page.
2. Find the API key you want to delete and click the Actions icon.
3. Select Delete API key.
4. In the Delete credential modal, select Delete.

Deleting an API key takes a few minutes to propagate. After propagation completes, any traffic using the deleted API key is rejected.

Important: If you have deleted a key that is still used in production and need to recover it, see gcloud beta services api-keys undelete.

Next steps

See the Generative AI on Vertex AI overview to learn more about generative AI solutions on Vertex AI.
\ No newline at end of file
diff --git a/text_content/docs_migrate_55bb67b4.txt b/text_content/docs_migrate_55bb67b4.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dd6d92958667ac91213174b8dcaa901bc308343 --- /dev/null +++ b/text_content/docs_migrate_55bb67b4.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/migrate#client +Title: Migrate to the Google GenAI SDK  |  Gemini API  |  Google AI for Developers +==================================================

Migrate to the Google GenAI SDK

Starting with the Gemini 2.0 release in late 2024, we introduced a new set of libraries called the Google GenAI SDK. It offers an improved developer experience through an updated client architecture, and simplifies the transition between developer and enterprise workflows. The Google GenAI SDK is now in General Availability (GA) across all supported platforms. If you're using one of our legacy libraries, we strongly recommend that you migrate. This guide provides before-and-after examples of migrated code to help you get started.

Note: The Go examples omit imports and other boilerplate code to improve readability.

Installation

Before

Python pip install -U -q "google-generativeai"
JavaScript npm install @google/generative-ai
Go go get github.com/google/generative-ai-go

After

Python pip install -U -q "google-genai"
JavaScript npm install @google/genai
Go go get google.golang.org/genai

API access

The old SDK implicitly handled the API client behind the scenes using a variety of ad hoc methods, which made it hard to manage the client and credentials. Now, you interact through a central Client object.
This Client object acts as a single entry point for various API services (e.g., models, chats, files, tunings), promoting consistency and simplifying credential and configuration management across different API calls.

Before (Less Centralized API Access)

Python The old SDK didn't explicitly use a top-level client object for most API calls. You would directly instantiate and interact with GenerativeModel objects. import google.generativeai as genai # Directly create and use model objects model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( ... ) chat = model . start_chat ( ... )

JavaScript While GoogleGenerativeAI was a central point for models and chat, other functionalities like file and cache management often required importing and instantiating entirely separate client classes. import { GoogleGenerativeAI } from "@google/generative-ai" ; import { GoogleAIFileManager , GoogleAICacheManager } from "@google/generative-ai/server" ; // For files/caching const genAI = new GoogleGenerativeAI ( "YOUR_API_KEY" ); const fileManager = new GoogleAIFileManager ( "YOUR_API_KEY" ); const cacheManager = new GoogleAICacheManager ( "YOUR_API_KEY" ); // Get a model instance, then call methods on it const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const result = await model . generateContent (...); const chat = model . startChat (...); // Call methods on separate client objects for other services const uploadedFile = await fileManager . uploadFile (...); const cache = await cacheManager . create (...);

Go The genai.NewClient function created a client, but generative model operations were typically called on a separate GenerativeModel instance obtained from this client. Other services might have been accessed via distinct packages or patterns. import ( "github.com/google/generative-ai-go/genai" "github.com/google/generative-ai-go/genai/fileman" // For files "google.golang.org/api/option" ) client , err := genai . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) fileClient , err := fileman . NewClient ( ctx , option . WithAPIKey ( "YOUR_API_KEY" )) // Get a model instance, then call methods on it model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ... ) cs := model . StartChat () // Call methods on separate client objects for other services uploadedFile , err := fileClient . UploadFile ( ... )

After (Centralized Client Object)

Python from google import genai # Create a single client object client = genai . Client () # Access API methods through services on the client object response = client . models . generate_content ( ... ) chat = client . chats . create ( ... ) my_file = client . files . upload ( ... ) tuning_job = client . tunings . tune ( ... )

JavaScript import { GoogleGenAI } from "@google/genai" ; // Create a single client object const ai = new GoogleGenAI ({ apiKey : "YOUR_API_KEY" }); // Access API methods through services on the client object const response = await ai . models . generateContent (...); const chat = ai . chats . create (...); const uploadedFile = await ai . files . upload (...); const cache = await ai . caches . create (...);

Go import "google.golang.org/genai" // Create a single client object client , err := genai . NewClient ( ctx , nil ) // Access API methods through services on the client object result , err := client . Models . GenerateContent ( ... ) chat , err := client . Chats . Create ( ... ) uploadedFile , err := client . Files . Upload ( ... ) tuningJob , err := client . Tunings . Tune ( ... )
Authentication

Both legacy and new libraries authenticate using API keys. You can create your API key in Google AI Studio.

Before

Python The old SDK handled the API client object implicitly. import google.generativeai as genai genai . configure ( api_key =... )

JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" );

Go Import the Google libraries: import ( "github.com/google/generative-ai-go/genai" "google.golang.org/api/option" ) Create the client: client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" ))

After

Python With the Google GenAI SDK, you create an API client first, which is used to call the API. The new SDK will pick up your API key from either of the GEMINI_API_KEY or GOOGLE_API_KEY environment variables if you don't pass one to the client. export GEMINI_API_KEY="YOUR_API_KEY" from google import genai client = genai . Client () # Set the API key using the GEMINI_API_KEY env var. # Alternatively, you could set the API key explicitly: # client = genai.Client(api_key="your_api_key")

JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GEMINI_API_KEY" });

Go Import the GenAI library: import "google.golang.org/genai" Create the client: client , err := genai . NewClient ( ctx , & genai . ClientConfig { Backend : genai . BackendGeminiAPI , })

Generate content

Text

Before

Python Previously, there were no client objects; you accessed APIs directly through GenerativeModel objects. import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'Tell me a story in 300 words' ) print ( response . text )

JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( process . env . API_KEY ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Tell me a story in 300 words" ; const result = await model . generateContent ( prompt ); console . log ( result . response . text ());

Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me a story in 300 words." )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response parts

After

Python The new Google GenAI SDK provides access to all the API methods through the Client object. Except for a few stateful special cases ( chat and live-api sessions ), these are all stateless functions. For utility and uniformity, objects returned are pydantic classes. from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) print ( response . text ) print ( response . model_dump_json ( exclude_none = True , indent = 4 ))

JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story in 300 words." , }); console . log ( response . text );

Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) }
Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me a story in 300 words." ), nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Image Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Tell me a story based on this image' , Image . open ( image_path ) ]) print ( response . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); function fileToGenerativePart ( path , mimeType ) { return { inlineData : { data : Buffer . from ( fs . readFileSync ( path )). toString ( "base64" ), mimeType , }, }; } const prompt = "Tell me a story based on this image" ; const imagePart = fileToGenerativePart ( `path/to/organ.jpg` , "image/jpeg" , ); const result = await model . generateContent ([ prompt , imagePart ]); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about this instrument" ), genai . ImageData ( "jpeg" , imgData )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python Many of the same convenience features exist in the new SDK. For example, PIL.Image objects are automatically converted. from google import genai from PIL import Image client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Tell me a story based on this image' , Image . open ( image_path ) ] ) print ( response . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const organ = await ai . files . upload ({ file : "path/to/organ.jpg" , }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : [ createUserContent ([ "Tell me a story based on this image" , createPartFromUri ( organ . uri , organ . mimeType ) ]), ], }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imgData , err := os . ReadFile ( "path/to/organ.jpg" ) if err != nil { log . Fatal ( err ) } parts := [] * genai . Part { { Text : "Tell me a story based on this image" }, { InlineData : & genai . Blob { Data : imgData , MIMEType : "image/jpeg" }}, } contents := [] * genai . Content { { Parts : parts }, } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , contents , nil ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Streaming Before Python import google.generativeai as genai response = model . generate_content ( "Write a cute story about cats." , stream = True ) for chunk in response : print ( chunk . text ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . 
getGenerativeModel ({ model : "gemini-1.5-flash" }); const prompt = "Write a story about a magic backpack." ; const result = await model . generateContentStream ( prompt ); // Print text as it comes in. for await ( const chunk of result . stream ) { const chunkText = chunk . text (); process . stdout . write ( chunkText ); } Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) iter := model . GenerateContentStream ( ctx , genai . Text ( "Write a story about a magic backpack." )) for { resp , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing the response } After Python from google import genai client = genai . Client () for chunk in client . models . generate_content_stream ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ): print ( chunk . text ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContentStream ({ model : "gemini-2.0-flash" , contents : "Write a story about a magic backpack." , }); let text = "" ; for await ( const chunk of response ) { console . log ( chunk . text ); text += chunk . text ; } Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } for result , err := range client . Models . GenerateContentStream ( ctx , "gemini-2.0-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) { if err != nil { log . Fatal ( err ) } fmt . Print ( result . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } Configuration Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' , system_instruction = 'you are a story teller for kids under 5 years old' , generation_config = genai . GenerationConfig ( max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], ) ) response = model . generate_content ( 'tell me a story in 100 words' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , generationConfig : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); const result = await model . generateContent ( "Tell me a story about a magic backpack." , ); console . log ( result . response . text ()) Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) model . SetTemperature ( 0.5 ) model . SetTopP ( 0.5 ) model . SetTopK ( 2.0 ) model . SetMaxOutputTokens ( 100 ) model . ResponseMIMEType = "application/json" resp , err := model . GenerateContent ( ctx , genai . Text ( "Tell me about New York" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) // utility for printing response After Python For all methods in the new SDK, the required arguments are provided as keyword arguments. All optional inputs are provided in the config argument. 
Config arguments can be specified as either Python dictionaries or Config classes in the google.genai.types namespace. For utility and uniformity, all definitions within the types module are pydantic classes. from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 100 words.' , config = types . GenerateContentConfig ( system_instruction = 'you are a story teller for kids under 5 years old' , max_output_tokens = 400 , top_k = 2 , top_p = 0.5 , temperature = 0.5 , response_mime_type = 'application/json' , stop_sequences = [ ' \n ' ], seed = 42 , ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Tell me a story about a magic backpack." , config : { candidateCount : 1 , stopSequences : [ "x" ], maxOutputTokens : 20 , temperature : 1.0 , }, }); console . log ( response . text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Tell me about New York" ), & genai . GenerateContentConfig { Temperature : genai . Ptr [ float32 ]( 0.5 ), TopP : genai . Ptr [ float32 ]( 0.5 ), TopK : genai . Ptr [ float32 ]( 2.0 ), ResponseMIMEType : "application/json" , StopSequences : [] string { "Yankees" }, CandidateCount : 2 , Seed : genai . Ptr [ int32 ]( 42 ), MaxOutputTokens : 128 , PresencePenalty : genai . Ptr [ float32 ]( 0.5 ), FrequencyPenalty : genai . Ptr [ float32 ]( 0.5 ), }, ) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing response Safety settings Generate a response with safety settings: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( 'say something bad' , safety_settings = { 'HATE' : 'BLOCK_ONLY_HIGH' , 'HARASSMENT' : 'BLOCK_ONLY_HIGH' , } ) JavaScript import { GoogleGenerativeAI , HarmCategory , HarmBlockThreshold } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" , safetySettings : [ { category : HarmCategory . HARM_CATEGORY_HARASSMENT , threshold : HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , }, ], }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! Write an ironic phrase telling " + "them how I feel about them." ; const result = await model . generateContent ( unsafePrompt ); try { result . response . text (); } catch ( e ) { console . error ( e ); console . log ( result . response . candidates [ 0 ]. safetyRatings ); } After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'say something bad' , config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = 'HARM_CATEGORY_HATE_SPEECH' , threshold = 'BLOCK_ONLY_HIGH' ), ] ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const unsafePrompt = "I support Martians Soccer Club and I think " + "Jupiterians Football Club sucks! 
Write an ironic phrase telling " + "them how I feel about them." ; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : unsafePrompt , config : { safetySettings : [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_ONLY_HIGH" , }, ], }, }); console . log ( "Finish reason:" , response . candidates [ 0 ]. finishReason ); console . log ( "Safety ratings:" , response . candidates [ 0 ]. safetyRatings ); Async Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = await model . generate_content_async ( 'tell me a story in 100 words' ) After Python To use the new SDK with asyncio , use the separate async implementation of every method, available under client.aio . from google import genai client = genai . Client () response = await client . aio . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Tell me a story in 300 words.' ) Chat Start a chat and send a message to the model: Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) chat = model . start_chat () response = chat . send_message ( "Tell me a story in 100 words" ) response = chat . send_message ( "What happened after that?" ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); const chat = model . startChat ({ history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); let result = await chat . sendMessage ( "I have 2 dogs in my house." ); console . log ( result . response . text ()); result = await chat . sendMessage ( "How many paws are in my house?" ); console . log ( result . response . text ()); Go ctx := context . Background () client , err := genai . NewClient ( ctx , option . WithAPIKey ( "GOOGLE_API_KEY" )) if err != nil { log . Fatal ( err ) } defer client . Close () model := client . GenerativeModel ( "gemini-1.5-flash" ) cs := model . StartChat () cs . History = [] * genai . Content { { Parts : [] genai . Part { genai . Text ( "Hello, I have 2 dogs in my house." ), }, Role : "user" , }, { Parts : [] genai . Part { genai . Text ( "Great to meet you. What would you like to know?" ), }, Role : "model" , }, } res , err := cs . SendMessage ( ctx , genai . Text ( "How many paws are in my house?" )) if err != nil { log . Fatal ( err ) } printResponse ( res ) // utility for printing the response After Python from google import genai client = genai . Client () chat = client . chats . create ( model = 'gemini-2.0-flash' ) response = chat . send_message ( message = 'Tell me a story in 100 words' ) response = chat . send_message ( message = 'What happened after that?' ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const chat = ai . chats . create ({ model : "gemini-2.0-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 .
text ); Go ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } chat , err := client . Chats . Create ( ctx , "gemini-2.0-flash" , nil , nil ) if err != nil { log . Fatal ( err ) } result , err := chat . SendMessage ( ctx , genai . Part { Text : "Hello, I have 2 dogs in my house." }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result result , err = chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if err != nil { log . Fatal ( err ) } debugPrint ( result ) // utility for printing result Function calling Before Python import google.generativeai as genai def get_current_weather ( location : str ) - > str : """Get the current weather in a given location. Args: location: required, The city and state, e.g. San Francisco, CA """ print ( f 'Called with: { location =} ' ) return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) response = model . generate_content ( "What is the weather in San Francisco?" ) function_call = response . candidates [ 0 ] . parts [ 0 ] . function_call After Python In the new SDK, automatic function calling is the default; this example disables it so that the returned function call can be inspected. from google import genai from google.genai import types client = genai . Client () def get_current_weather ( location : str ) - > str : """Get the current weather in a given location. Args: location: required, The city and state, e.g. San Francisco, CA """ print ( f 'Called with: { location =} ' ) return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ], automatic_function_calling = { 'disable' : True }, ), ) function_call = response . candidates [ 0 ] . content . parts [ 0 ] . function_call Automatic function calling Before Python The old SDK only supports automatic function calling in chat. In the new SDK this is the default behavior in generate_content . import google.generativeai as genai def get_current_weather ( city : str ) - > str : return "23C" model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = [ get_current_weather ] ) chat = model . start_chat ( enable_automatic_function_calling = True ) result = chat . send_message ( "What is the weather in San Francisco?" ) After Python from google import genai from google.genai import types client = genai . Client () def get_current_weather ( city : str ) - > str : return "23C" response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = "What is the weather like in Boston?" , config = types . GenerateContentConfig ( tools = [ get_current_weather ] ), ) Code execution Code execution is a tool that allows the model to generate Python code, run it, and return the result. Before Python import google.generativeai as genai model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" , tools = "code_execution" ) result = model . generate_content ( "What is the sum of the first 50 prime numbers? Generate and run code for " "the calculation, and make sure you get all 50." ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI .
getGenerativeModel ({ model : "gemini-1.5-flash" , tools : [{ codeExecution : {} }], }); const result = await model . generateContent ( "What is the sum of the first 50 prime numbers? " + "Generate and run code for the calculation, and make sure you get " + "all 50." , ); console . log ( result . response . text ()); After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the sum of the first 50 prime numbers? Generate and run ' 'code for the calculation, and make sure you get all 50.' , config = types . GenerateContentConfig ( tools = [ types . Tool ( code_execution = types . ToolCodeExecution )], ), ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-pro-exp-02-05" , contents : `Write and execute code that calculates the sum of the first 50 prime numbers. Ensure that only the executable code and its resulting output are generated.` , }); // Each part may contain text, executable code, or an execution result. for ( const part of response . candidates [ 0 ]. content . parts ) { console . log ( part ); console . log ( "\n" ); } console . log ( "-" . repeat ( 80 )); // The `.text` accessor concatenates the parts into a markdown-formatted text. console . log ( "\n" , response . text ); Search grounding GoogleSearch (Gemini>=2.0) and GoogleSearchRetrieval (Gemini < 2.0) are tools that allow the model to retrieve public web data for grounding, powered by Google. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ( contents = "what is the Google stock price?" , tools = 'google_search_retrieval' ) After Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'What is the Google stock price?' , config = types . GenerateContentConfig ( tools = [ types . Tool ( google_search = types . GoogleSearch () ) ] ) ) JSON response Generate answers in JSON format. Before Python By specifying a response_schema and setting response_mime_type="application/json" users can constrain the model to produce a JSON response following a given structure. import google.generativeai as genai import typing_extensions as typing class CountryInfo ( typing . TypedDict ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int model = genai . GenerativeModel ( model_name = "gemini-1.5-flash" ) result = model . generate_content ( "Give me information of the United States" , generation_config = genai . GenerationConfig ( response_mime_type = "application/json" , response_schema = CountryInfo ), ) JavaScript import { GoogleGenerativeAI , SchemaType } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const schema = { description : "List of recipes" , type : SchemaType . ARRAY , items : { type : SchemaType . OBJECT , properties : { recipeName : { type : SchemaType . STRING , description : "Name of the recipe" , nullable : false , }, }, required : [ "recipeName" ], }, }; const model = genAI . 
getGenerativeModel ({ model : "gemini-1.5-pro" , generationConfig : { responseMimeType : "application/json" , responseSchema : schema , }, }); const result = await model . generateContent ( "List a few popular cookie recipes." , ); console . log ( result . response . text ()); After Python The new SDK uses pydantic classes to provide the schema (although you can pass a genai.types.Schema or an equivalent dict ). When possible, the SDK will parse the returned JSON and return the result in response.parsed . If you provided a pydantic class as the schema, the SDK will convert that JSON to an instance of the class. from google import genai from pydantic import BaseModel client = genai . Client () class CountryInfo ( BaseModel ): name : str population : int capital : str continent : str major_cities : list [ str ] gdp : int official_language : str total_area_sq_mi : int response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = 'Give me information of the United States.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : CountryInfo , }, ) response . parsed JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "List a few popular cookie recipes." , config : { responseMimeType : "application/json" , responseSchema : { type : "array" , items : { type : "object" , properties : { recipeName : { type : "string" }, ingredients : { type : "array" , items : { type : "string" } }, }, required : [ "recipeName" , "ingredients" ], }, }, }, }); console . log ( response . text ); Files Upload Upload a file: Before Python import requests import pathlib import google.generativeai as genai # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = genai . upload_file ( path = 'a11.txt' ) model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . generate_content ([ 'Can you summarize this file:' , my_file ]) print ( response . text ) After Python import requests import pathlib from google import genai client = genai . Client () # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) my_file = client . files . upload ( file = 'a11.txt' ) response = client . models . generate_content ( model = 'gemini-2.0-flash' , contents = [ 'Can you summarize this file:' , my_file ] ) print ( response . text ) List and get List uploaded files and get an uploaded file with a filename: Before Python import google.generativeai as genai for file in genai . list_files (): print ( file . name ) file = genai . get_file ( name = file . name ) After Python from google import genai client = genai . Client () for file in client . files . list (): print ( file . name ) file = client . files . get ( name = file . name ) Delete Delete a file: Before Python import pathlib import google.generativeai as genai pathlib . Path ( 'dummy.txt' ) . write_text ( 'dummy text' ) dummy_file = genai . upload_file ( path = 'dummy.txt' ) file = genai . delete_file ( name = dummy_file . name ) After Python import pathlib from google import genai client = genai . Client () pathlib . Path ( 'dummy.txt' ) . write_text ( 'dummy text' ) dummy_file = client . files .
upload ( file = 'dummy.txt' ) response = client . files . delete ( name = dummy_file . name ) Context caching Context caching allows the user to pass the content to the model once, cache the input tokens, and then refer to the cached tokens in subsequent calls to lower the cost. Before Python import requests import pathlib import google.generativeai as genai from google.generativeai import caching # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = genai . upload_file ( path = "a11.txt" ) # Create cache apollo_cache = caching . CachedContent . create ( model = "gemini-1.5-flash-001" , system_instruction = "You are an expert at analyzing transcripts." , contents = [ document ], ) # Generate response apollo_model = genai . GenerativeModel . from_cached_content ( cached_content = apollo_cache ) response = apollo_model . generate_content ( "Find a lighthearted moment from this transcript" ) JavaScript import { GoogleAICacheManager , GoogleAIFileManager } from "@google/generative-ai/server" ; import { GoogleGenerativeAI } from "@google/generative-ai" ; const cacheManager = new GoogleAICacheManager ( "GOOGLE_API_KEY" ); const fileManager = new GoogleAIFileManager ( "GOOGLE_API_KEY" ); const uploadResult = await fileManager . uploadFile ( "path/to/a11.txt" , { mimeType : "text/plain" , }); const cacheResult = await cacheManager . create ({ model : "models/gemini-1.5-flash" , contents : [ { role : "user" , parts : [ { fileData : { fileUri : uploadResult . file . uri , mimeType : uploadResult . file . mimeType , }, }, ], }, ], }); console . log ( cacheResult ); const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModelFromCachedContent ( cacheResult ); const result = await model . generateContent ( "Please summarize this transcript." , ); console . log ( result . response . text ()); After Python import requests import pathlib from google import genai from google.genai import types client = genai . Client () # Check which models support caching. for m in client . models . list (): for action in m . supported_actions : if action == "createCachedContent" : print ( m . name ) break # Download file response = requests . get ( 'https://storage.googleapis.com/generativeai-downloads/data/a11.txt' ) pathlib . Path ( 'a11.txt' ) . write_text ( response . text ) # Upload file document = client . files . upload ( file = 'a11.txt' ) # Create cache model = 'gemini-1.5-flash-001' apollo_cache = client . caches . create ( model = model , config = { 'contents' : [ document ], 'system_instruction' : 'You are an expert at analyzing transcripts.' , }, ) # Generate response response = client . models . generate_content ( model = model , contents = 'Find a lighthearted moment from this transcript' , config = types . GenerateContentConfig ( cached_content = apollo_cache . name , ) ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const filePath = path . join ( media , "a11.txt" ); const document = await ai . files . upload ({ file : filePath , config : { mimeType : "text/plain" }, }); console . log ( "Uploaded file name:" , document . name ); const modelName = "gemini-1.5-flash" ; const contents = [ createUserContent ( createPartFromUri ( document . uri , document . mimeType )), ]; const cache = await ai . caches . 
create ({ model : modelName , config : { contents : contents , systemInstruction : "You are an expert analyzing transcripts." , }, }); console . log ( "Cache created:" , cache ); const response = await ai . models . generateContent ({ model : modelName , contents : "Please summarize this transcript" , config : { cachedContent : cache . name }, }); console . log ( "Response text:" , response . text ); Count tokens Count the number of tokens in a request. Before Python import google.generativeai as genai model = genai . GenerativeModel ( 'gemini-1.5-flash' ) response = model . count_tokens ( 'The quick brown fox jumps over the lazy dog.' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "gemini-1.5-flash" }); // Count tokens in a prompt without calling text generation. const countResult = await model . countTokens ( "The quick brown fox jumps over the lazy dog." , ); console . log ( countResult . totalTokens ); // 11 const generateResult = await model . generateContent ( "The quick brown fox jumps over the lazy dog." , ); // On the response for `generateContent`, use `usageMetadata` // to get separate input and output token counts // (`promptTokenCount` and `candidatesTokenCount`, respectively), // as well as the combined token count (`totalTokenCount`). console . log ( generateResult . response . usageMetadata ); // candidatesTokenCount and totalTokenCount depend on response, may vary // { promptTokenCount: 11, candidatesTokenCount: 124, totalTokenCount: 135 } After Python from google import genai client = genai . Client () response = client . models . count_tokens ( model = 'gemini-2.0-flash' , contents = 'The quick brown fox jumps over the lazy dog.' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const prompt = "The quick brown fox jumps over the lazy dog." ; const countTokensResponse = await ai . models . countTokens ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( countTokensResponse . totalTokens ); const generateResponse = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : prompt , }); console . log ( generateResponse . usageMetadata ); Generate images Generate images: Before Python # pip install https://github.com/google-gemini/generative-ai-python@imagen import google.generativeai as genai imagen = genai . ImageGenerationModel ( "imagen-3.0-generate-001" ) gen_images = imagen . generate_images ( prompt = "Robot holding a red skateboard" , number_of_images = 1 , safety_filter_level = "block_low_and_above" , person_generation = "allow_adult" , aspect_ratio = "3:4" , ) After Python import pathlib from google import genai from google.genai import types client = genai . Client () gen_images = client . models . generate_images ( model = 'imagen-3.0-generate-001' , prompt = 'Robot holding a red skateboard' , config = types . GenerateImagesConfig ( number_of_images = 1 , safety_filter_level = "BLOCK_LOW_AND_ABOVE" , person_generation = "ALLOW_ADULT" , aspect_ratio = "3:4" , ) ) for n , image in enumerate ( gen_images . generated_images ): pathlib . Path ( f ' { n } .png' ) . write_bytes ( image . image . image_bytes ) Embed content Generate content embeddings. Before Python import google.generativeai as genai response = genai .
embed_content ( model = 'models/text-embedding-004' , content = 'Hello world' ) JavaScript import { GoogleGenerativeAI } from "@google/generative-ai" ; const genAI = new GoogleGenerativeAI ( "GOOGLE_API_KEY" ); const model = genAI . getGenerativeModel ({ model : "text-embedding-004" , }); const result = await model . embedContent ( "Hello world!" ); console . log ( result . embedding ); After Python from google import genai client = genai . Client () response = client . models . embed_content ( model = 'text-embedding-004' , contents = 'Hello world' , ) JavaScript import { GoogleGenAI } from '@google/genai' ; const ai = new GoogleGenAI ({ apiKey : "GOOGLE_API_KEY" }); const text = "Hello World!" ; const result = await ai . models . embedContent ({ model : "text-embedding-004" , contents : text , config : { outputDimensionality : 10 }, }); console . log ( result . embeddings ); Tune a Model Create and use a tuned model. The new SDK simplifies tuning with client.tunings.tune , which launches the tuning job and polls until the job is complete. Before Python import google.generativeai as genai import random # create tuning model train_data = {} for i in range ( 1 , 6 ): key = f 'input { i } ' value = f 'output { i } ' train_data [ key ] = value name = f 'generate-num- { random . randint ( 0 , 10000 ) } ' operation = genai . create_tuned_model ( source_model = 'models/gemini-1.5-flash-001-tuning' , training_data = train_data , id = name , epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , ) # wait for tuning complete tuningProgress = operation . result () # generate content with the tuned model model = genai . GenerativeModel ( model_name = f 'tunedModels/ { name } ' ) response = model . generate_content ( '55' ) After Python from google import genai from google.genai import types client = genai . Client () # Check which models are available for tuning. for m in client . models . list (): for action in m . supported_actions : if action == "createTunedModel" : print ( m . name ) break # create tuning model training_dataset = types . TuningDataset ( examples = [ types . TuningExample ( text_input = f 'input { i } ' , output = f 'output { i } ' , ) for i in range ( 5 ) ], ) tuning_job = client . tunings . tune ( base_model = 'models/gemini-1.5-flash-001-tuning' , training_dataset = training_dataset , config = types . CreateTuningJobConfig ( epoch_count = 5 , batch_size = 4 , learning_rate = 0.001 , tuned_model_display_name = "test tuned model" ) ) # generate content with the tuned model response = client . models . generate_content ( model = tuning_job . tuned_model . model , contents = '55' , ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-09 UTC. 
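As a recap of the new SDK surface covered above, the following minimal sketch chains the centralized services together: a stateless models call, a stateful chat, a file upload, and the async variant under client.aio. It assumes the google-genai Python package is installed, GEMINI_API_KEY is set in the environment, and that a local file named sample.txt exists; the file name and prompts are placeholders.

import asyncio
from google import genai

client = genai.Client()  # picks up GEMINI_API_KEY from the environment

# Stateless generation through the models service.
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents='Tell me a story in 300 words.',
)
print(response.text)

# Stateful chat through the chats service.
chat = client.chats.create(model='gemini-2.0-flash')
print(chat.send_message(message='Hello').text)

# File upload through the files service ('sample.txt' is a placeholder path).
my_file = client.files.upload(file='sample.txt')

# The same generation call, using the async implementation under client.aio.
async def main():
    result = await client.aio.models.generate_content(
        model='gemini-2.0-flash',
        contents='Tell me a story in 300 words.',
    )
    print(result.text)

asyncio.run(main())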
\ No newline at end of file diff --git a/text_content/docs_models_0f41e493.txt b/text_content/docs_models_0f41e493.txt new file mode 100644 index 0000000000000000000000000000000000000000..80be4c01452cb04206b578ef8e08e2678a79f04b --- /dev/null +++ b/text_content/docs_models_0f41e493.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, and get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default. If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend that you review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming.
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
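Flash-Lite is served through the same generateContent surface used elsewhere in these docs, so a minimal call only swaps in the preview model code from the details card below. A sketch using the google-genai Python SDK, assuming an API key in the environment:

from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set

response = client.models.generate_content(
    model='gemini-2.5-flash-lite-preview-06-17',  # preview model code from the card below
    contents='In one sentence, what is a mixture-of-experts model?',
)
print(response.text)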
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
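Both TTS previews take text in and return audio out through the standard generateContent call with an audio response modality. The sketch below uses the SDK's speech configuration types (types.SpeechConfig, types.VoiceConfig, types.PrebuiltVoiceConfig) and the prebuilt voice name 'Kore'; treat these names, and the inline_data location of the returned PCM bytes, as assumptions to verify against the speech generation guide.

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model='gemini-2.5-pro-preview-tts',  # model code from the details card below
    contents='Say cheerfully: have a wonderful day!',
    config=types.GenerateContentConfig(
        response_modalities=['AUDIO'],  # request audio rather than text
        speech_config=types.SpeechConfig(  # assumed config/type names
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name='Kore')
            )
        ),
    ),
)

# Assumption: raw PCM bytes arrive as inline data on the first candidate part.
audio_bytes = response.candidates[0].content.parts[0].inline_data.data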
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
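Imagen models are called through the images surface rather than generateContent. Mirroring the generate_images example in the migration guide earlier in this document, an Imagen 4 sketch only changes the model code (taken from the details card below); note that the Ultra variant returns exactly one image per request.

from google import genai
from google.genai import types

client = genai.Client()

result = client.models.generate_images(
    model='imagen-4.0-generate-preview-06-06',  # preview model code from the card below
    prompt='Robot holding a red skateboard',
    config=types.GenerateImagesConfig(
        number_of_images=1,  # 1 to 4 for Standard; Ultra always returns 1
        aspect_ratio='3:4',
    ),
)
for n, image in enumerate(result.generated_images):
    with open(f'imagen4_{n}.png', 'wb') as f:
        f.write(image.image.image_bytes)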
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
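The experimental embedding model is called through the same embed_content method shown in the migration guide earlier in this document. The elastic output sizes listed in the details card below should map to the config's output dimensionality, as in the JavaScript embedding example above; the exact Python config type (types.EmbedContentConfig) is an assumption to verify.

from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model='gemini-embedding-exp-03-07',  # experimental model code from the card below
    contents='What is the meaning of life?',
    # Assumed config type; pick one of the elastic sizes: 3072, 1536, or 768.
    config=types.EmbedContentConfig(output_dimensionality=768),
)
print(result.embeddings)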
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . 
AQA
You can use the AQA model to perform Attributed Question-Answering (AQA) tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question is answerable.
Model details
  Model code: models/aqa
  Supported data types: Input: Text; Output: Text
  Supported language: English
  Token limits [*]: Input token limit: 7,168; Output token limit: 1,024
  Rate limits [**]: 1,500 requests per minute
  Adjustable safety settings: Supported
  Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns
Gemini models are available in stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. (A short sketch at the end of this page shows these patterns in use.)

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the pattern <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change, so most production apps should use a specific stable model. To specify a stable version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use and may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models
In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models
As new versions or stable releases become available, we remove and replace experimental models. The previously released experimental models and their replacement versions are:

Model code | Base model | Replacement version
gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages
Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
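To make the version name patterns above concrete, the sketch below passes each kind of name to the same generate_content call (google-genai Python SDK assumed; the prompt is a placeholder):

Python
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment
prompt = "In one sentence, what does a model version suffix mean?"

# Latest stable alias: <model>-<generation>-<variation>
print(client.models.generate_content(model="gemini-2.0-flash", contents=prompt).text)

# Pinned stable version: recommended for production, since it doesn't change.
print(client.models.generate_content(model="gemini-2.0-flash-001", contents=prompt).text)

# Preview version: may be swapped or retired without long notice.
print(client.models.generate_content(model="gemini-2.5-pro-preview-06-05", contents=prompt).text)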
\ No newline at end of file diff --git a/text_content/docs_models_19d47b8d.txt b/text_content/docs_models_19d47b8d.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d2464b66539b602bc19e187f19f6b39ca5148d0 --- /dev/null +++ b/text_content/docs_models_19d47b8d.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro +Title: Gemini models  |  Gemini API  |  Google AI for Developers +==================================================
\ No newline at end of file diff --git a/text_content/docs_models_25c479c1.txt b/text_content/docs_models_25c479c1.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8148847205b7ca01ba3e31dd93a0361ea7d26bf --- /dev/null +++ b/text_content/docs_models_25c479c1.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash-preview-image-generation +Title: Gemini models  |  Gemini API  |  Google AI for Developers +==================================================
\ No newline at end of file
diff --git a/text_content/docs_models_2e366a43.txt b/text_content/docs_models_2e366a43.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fbc851d0b403da6a0867b8db75ec7f3d4eef4408
--- /dev/null
+++ b/text_content/docs_models_2e366a43.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models#token-size
+Title: Gemini models | Gemini API | Google AI for Developers
+==================================================

Gemini models

2.5 Pro: Our most powerful thinking model, with maximum response accuracy and state-of-the-art performance. Input audio, images, video, and text; get text responses. Tackles difficult problems, analyzes large databases, and more. Best for complex coding, reasoning, and multimodal understanding.

2.5 Flash: Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text; get text responses. The model thinks as needed, or you can configure a thinking budget. Best for low-latency, high-volume tasks that require thinking.

2.5 Flash-Lite: A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text; get text responses. Our most cost-efficient model supporting high throughput. Best for real-time, low-latency use cases.

Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default. If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend that you review the Thinking guide first.

Model variants

The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of the available Gemini variants:

Model variant | Input(s) | Output | Optimized for
Gemini 2.5 Pro (gemini-2.5-pro) | Audio, images, video, text, and PDF | Text | Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more
Gemini 2.5 Flash (gemini-2.5-flash) | Audio, images, video, and text | Text | Adaptive thinking, cost efficiency
Gemini 2.5 Flash-Lite Preview (gemini-2.5-flash-lite-preview-06-17) | Text, images, video, and audio | Text | Most cost-efficient model supporting high throughput
Gemini 2.5 Flash Native Audio (gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog) | Audio, video, and text | Text and audio, interleaved | High-quality, natural conversational audio outputs, with or without thinking
Gemini 2.5 Flash Preview TTS (gemini-2.5-flash-preview-tts) | Text | Audio | Low-latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.5 Pro Preview TTS (gemini-2.5-pro-preview-tts) | Text | Audio | Low-latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.0 Flash (gemini-2.0-flash) | Audio, images, video, and text | Text | Next-generation features, speed, and realtime streaming
Gemini 2.0 Flash Preview Image Generation (gemini-2.0-flash-preview-image-generation) | Audio, images, video, and text | Text, images | Conversational image generation and editing
Gemini 2.0 Flash-Lite (gemini-2.0-flash-lite) | Audio, images, video, and text | Text | Cost efficiency and low latency
Gemini 1.5 Flash (gemini-1.5-flash) | Audio, images, video, and text | Text | Fast and versatile performance across a diverse variety of tasks
Gemini 1.5 Flash-8B (gemini-1.5-flash-8b) | Audio, images, video, and text | Text | High-volume and lower-intelligence tasks
Gemini 1.5 Pro (gemini-1.5-pro) | Audio, images, video, and text | Text | Complex reasoning tasks requiring more intelligence
Gemini Embedding (gemini-embedding-exp) | Text | Text embeddings | Measuring the relatedness of text strings
Imagen 4 (imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06) | Text | Images | Our most up-to-date image generation model
Imagen 3 (imagen-3.0-generate-002) | Text | Images | High-quality image generation model
Veo 2 (veo-2.0-generate-001) | Text, images | Video | High-quality video generation
Gemini 2.5 Flash Live (gemini-live-2.5-flash-preview) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions
Gemini 2.0 Flash Live (gemini-2.0-flash-live-001) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions

You can view the rate limits for each model on the rate limits page.
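Because this table changes as models are released and retired, it can be worth enumerating the variants programmatically. A minimal sketch, assuming the google-genai Python SDK used elsewhere on this site:

from google import genai

client = genai.Client()

# List every model variant this API key can access, with its display name.
for model in client.models.list():
    print(model.name, "-", model.display_name)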
Gemini 2.5 Pro

Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context.

Model details

Model code: gemini-2.5-pro
Supported data types: Inputs: audio, images, video, text, and PDF. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 65,536.
Capabilities: Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search grounding: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Thinking: Supported. Batch API: Supported.
Versions (read the model version patterns for more details): Stable: gemini-2.5-pro. Previews: gemini-2.5-pro-preview-06-05, gemini-2.5-pro-preview-05-06, gemini-2.5-pro-preview-03-25.
Latest update: June 2025
Knowledge cutoff: January 2025

Gemini 2.5 Flash

Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large-scale processing, low-latency, high-volume tasks that require thinking, and agentic use cases.

Model details

Model code: models/gemini-2.5-flash
Supported data types: Inputs: text, images, video, and audio. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 65,536.
Capabilities: Audio generation: Not supported. Caching: Supported. Code execution: Supported. Function calling: Supported. Image generation: Not supported. Search grounding: Supported. Structured outputs: Supported. Thinking: Supported. Tuning: Not supported. Batch API: Supported.
Versions (read the model version patterns for more details): Stable: gemini-2.5-flash. Preview: gemini-2.5-flash-preview-05-20.
Latest update: June 2025
Knowledge cutoff: January 2025
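Both 2.5 models think by default, and on 2.5 Flash the budget is adjustable. A hedged sketch of configuring it, assuming the google-genai SDK's thinking configuration types (the budget value here is an arbitrary example):

from google import genai
from google.genai import types

client = genai.Client()

# Cap the number of thinking tokens 2.5 Flash may spend before answering.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the birthday paradox briefly.",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)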
Gemini 2.5 Flash-Lite Preview

A Gemini 2.5 Flash model optimized for cost efficiency and low latency.

Model details

Model code: models/gemini-2.5-flash-lite-preview-06-17
Supported data types: Inputs: text, images, video, and audio. Output: text.
Token limits [*]: Input token limit: 1,000,000. Output token limit: 64,000.
Capabilities: Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. URL context: Supported. Search grounding: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Thinking: Supported.
Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-lite-preview-06-17.
Latest update: June 2025
Knowledge cutoff: January 2025

Gemini 2.5 Flash Native Audio

Our native audio dialog models, with and without thinking, are available through the Live API. These models provide interactive and unstructured conversational experiences, with style and control prompting.

Model details

Model codes: models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog
Supported data types: Inputs: audio, video, and text. Output: audio and text.
Token limits [*]: Input token limit: 128,000. Output token limit: 8,000.
Capabilities: Audio generation: Supported. Caching: Not supported. Code execution: Not supported. Function calling: Supported. Image generation: Not supported. Search grounding: Supported. Structured outputs: Not supported. Thinking: Supported. Tuning: Not supported.
Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-preview-05-20. Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog.
Latest update: May 2025
Knowledge cutoff: January 2025

Gemini 2.5 Flash Preview Text-to-Speech

Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Rate limits for this model are more restrictive since it is an experimental / preview model.

Model details

Model code: models/gemini-2.5-flash-preview-tts
Supported data types: Input: text. Output: audio.
Token limits [*]: Input token limit: 8,000. Output token limit: 16,000.
Capabilities: Structured outputs: Not supported. Caching: Not supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Audio generation: Supported. Live API: Not supported. Thinking: Not supported.
Versions (read the model version patterns for more details): gemini-2.5-flash-preview-tts.
Latest update: May 2025
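To get speech out of the TTS models, you request the AUDIO response modality. A hedged sketch, assuming the google-genai SDK's speech config types; "Kore" is one prebuilt voice name, and the returned inline data carries the raw audio bytes:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The generated speech comes back as inline audio data on the first part.
audio_bytes = response.candidates[0].content.parts[0].inline_data.data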
Gemini 2.5 Pro Preview Text-to-Speech

Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Rate limits for this model are more restrictive since it is an experimental / preview model.

Model details

Model code: models/gemini-2.5-pro-preview-tts
Supported data types: Input: text. Output: audio.
Token limits [*]: Input token limit: 8,000. Output token limit: 16,000.
Capabilities: Structured outputs: Not supported. Caching: Not supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Audio generation: Supported. Live API: Not supported. Thinking: Not supported.
Versions (read the model version patterns for more details): gemini-2.5-pro-preview-tts.
Latest update: May 2025

Gemini 2.0 Flash

Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M-token context window.

Model details

Model code: models/gemini-2.0-flash
Supported data types: Inputs: audio, images, video, and text. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Capabilities: Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Supported. Thinking: Experimental. Batch API: Supported.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash. Stable: gemini-2.0-flash-001. Experimental: gemini-2.0-flash-exp.
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 2.0 Flash Preview Image Generation

Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.

Model details

Model code: models/gemini-2.0-flash-preview-image-generation
Supported data types: Inputs: audio, images, video, and text. Output: text and images.
Token limits [*]: Input token limit: 32,000. Output token limit: 8,192.
Capabilities: Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Image generation: Supported. Audio generation: Not supported. Live API: Not supported. Thinking: Not supported.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-preview-image-generation. Note that gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update: May 2025
Knowledge cutoff: August 2024
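For conversational image generation, both the TEXT and IMAGE modalities are requested together. A hedged sketch, assuming the google-genai SDK; the prompt and output filename are placeholders:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate an image of a paper airplane flying over a city at dusk.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# The response interleaves text parts and inline image parts; save the images.
for part in response.candidates[0].content.parts:
    if part.inline_data is not None:
        with open("image.png", "wb") as f:
            f.write(part.inline_data.data)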
Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency.

Model details

Model code: models/gemini-2.0-flash-lite
Supported data types: Inputs: audio, images, video, and text. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Capabilities: Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Not supported. Search: Not supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Batch API: Supported.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite. Stable: gemini-2.0-flash-lite-001.
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.

Model details

Model code: models/gemini-1.5-flash
Supported data types: Inputs: audio, images, video, and text. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Audio/visual specs: Maximum number of images per prompt: 3,600. Maximum video length: 1 hour. Maximum audio length: approximately 9.5 hours.
Capabilities: System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Supported. Function calling: Supported. Code execution: Supported. Live API: Not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest. Latest stable: gemini-1.5-flash. Stable: gemini-1.5-flash-001, gemini-1.5-flash-002.
Latest update: September 2024

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks.

Model details

Model code: models/gemini-1.5-flash-8b
Supported data types: Inputs: audio, images, video, and text. Output: text.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Audio/visual specs: Maximum number of images per prompt: 3,600. Maximum video length: 1 hour. Maximum audio length: approximately 9.5 hours.
Capabilities: System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Supported. Function calling: Supported. Code execution: Supported. Live API: Not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest. Latest stable: gemini-1.5-flash-8b. Stable: gemini-1.5-flash-8b-001.
Latest update: October 2024

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.

Model details

Model code: models/gemini-1.5-pro
Supported data types: Inputs: audio, images, video, and text. Output: text.
Token limits [*]: Input token limit: 2,097,152. Output token limit: 8,192.
Audio/visual specs: Maximum number of images per prompt: 7,200. Maximum video length: 2 hours. Maximum audio length: approximately 19 hours.
Capabilities: System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Live API: Not supported.
Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest. Latest stable: gemini-1.5-pro. Stable: gemini-1.5-pro-001, gemini-1.5-pro-002.
Latest update: September 2024
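The Latest, Latest stable, and Stable aliases above differ in how they move between releases. A small sketch of pinning one specific stable version, assuming the google-genai SDK:

from google import genai

client = genai.Client()

# "gemini-1.5-flash" floats to the latest stable release; pinning "-002"
# keeps behavior fixed across deployments.
response = client.models.generate_content(
    model="gemini-1.5-flash-002",
    contents="Summarize the rules of chess in two sentences.",
)
print(response.text)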
Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher-resolution output than previous models.

Model details

Model codes (Gemini API): imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
Supported data types: Input: text. Output: images.
Token limits [*]: Input token limit: 480 tokens (text). Output images: 1 (Ultra); 1 to 4 (Standard).
Latest update: June 2025

Imagen 3

Imagen 3 is our highest-quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details

Model code (Gemini API): imagen-3.0-generate-002
Supported data types: Input: text. Output: images.
Token limits [*]: Input token limit: N/A. Output images: up to 4.
Latest update: February 2025

Veo 2

Veo 2 is our high-quality text- and image-to-video model, capable of generating detailed videos that capture the artistic nuance in your prompts.

Model details

Model code (Gemini API): veo-2.0-generate-001
Supported data types: Input: text and images. Output: video.
Limits: Text input: N/A. Image input: any image resolution and aspect ratio, up to 20 MB file size. Output videos: up to 2.
Latest update: April 2025

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details

Model code: models/gemini-live-2.5-flash-preview
Supported data types: Inputs: audio, video, and text. Output: text and audio.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Capabilities: Structured outputs: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Supported. Thinking: Not supported.
Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview.
Latest update: June 2025
Knowledge cutoff: January 2025

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details

Model code: models/gemini-2.0-flash-live-001
Supported data types: Inputs: audio, video, and text. Output: text and audio.
Token limits [*]: Input token limit: 1,048,576. Output token limit: 8,192.
Capabilities: Structured outputs: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Supported. Thinking: Not supported.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001.
Latest update: April 2025
Knowledge cutoff: August 2024
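Unlike generateContent, the Live models hold a persistent bidirectional session. A hedged sketch of a minimal text-in / text-out session, assuming the google-genai SDK's async live module; real applications would stream microphone audio instead of a single text turn:

import asyncio
from google import genai

client = genai.Client()

async def main():
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview",
        config={"response_modalities": ["TEXT"]},
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello from the Live API!"}]}
        )
        # Stream the model's reply chunks as they arrive.
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())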
Gemini Embedding Experimental

Gemini embedding achieves state-of-the-art performance across many key dimensions, including code, multilingual, and retrieval. Rate limits for Gemini Embedding are more restrictive since it is an experimental model.

Model details

Model code (Gemini API): gemini-embedding-exp-03-07
Supported data types: Input: text. Output: text embeddings.
Token limits [*]: Input token limit: 8,192. Output dimension size: elastic; supports 3072, 1536, or 768.
Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details

Model code (Gemini API): models/text-embedding-004
Supported data types: Input: text. Output: text embeddings.
Token limits [*]: Input token limit: 2,048. Output dimension size: 768.
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details

Model code: models/embedding-001
Supported data types: Input: text. Output: text embeddings.
Token limits [*]: Input token limit: 2,048. Output dimension size: 768.
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: December 2023

AQA

You can use the AQA model to perform Attributed Question-Answering (AQA) tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question can be answered from those sources.

Model details

Model code: models/aqa
Supported data types: Input: text. Output: text.
Supported language: English
Token limits [*]: Input token limit: 7,168. Output token limit: 1,024.
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Supported
Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.
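The footnote above is only a rule of thumb. When you need exact numbers, for example to stay under a model's input token limit, you can ask the API itself. A minimal sketch, assuming the google-genai Python SDK:

from google import genai

client = genai.Client()

# Ask for an exact count instead of estimating 4 characters per token.
result = client.models.count_tokens(
    model="gemini-2.0-flash",
    contents="The quick brown fox jumps over the lazy dog.",
)
print(result.total_tokens)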
Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the pattern <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change, and most production apps should use a specific stable model. To specify a stable version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models, along with their replacement versions, are listed below:

Model code | Base model | Replacement version
gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
\ No newline at end of file
diff --git a/text_content/docs_models_6ca2ad38.txt b/text_content/docs_models_6ca2ad38.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed1d967da84f4595858ba708b2dcb03b7301b2b2
--- /dev/null
+++ b/text_content/docs_models_6ca2ad38.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models#gemini-2.5-pro-preview-tts
+Title: Gemini models | Gemini API | Google AI for Developers
+==================================================

Gemini models

2.5 Pro: Our most powerful thinking model, with maximum response accuracy and state-of-the-art performance. Input audio, images, video, and text; get text responses. Tackles difficult problems, analyzes large databases, and more. Best for complex coding, reasoning, and multimodal understanding.

2.5 Flash: Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text; get text responses. The model thinks as needed, or you can configure a thinking budget. Best for low-latency, high-volume tasks that require thinking.

2.5 Flash-Lite: A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text; get text responses. Our most cost-efficient model supporting high throughput. Best for real-time, low-latency use cases.

Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default. If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend that you review the Thinking guide first.

Model variants

The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of the available Gemini variants:

Model variant | Input(s) | Output | Optimized for
Gemini 2.5 Pro (gemini-2.5-pro) | Audio, images, video, text, and PDF | Text | Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more
Gemini 2.5 Flash (gemini-2.5-flash) | Audio, images, video, and text | Text | Adaptive thinking, cost efficiency
Gemini 2.5 Flash-Lite Preview (gemini-2.5-flash-lite-preview-06-17) | Text, images, video, and audio | Text | Most cost-efficient model supporting high throughput
Gemini 2.5 Flash Native Audio (gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog) | Audio, video, and text | Text and audio, interleaved | High-quality, natural conversational audio outputs, with or without thinking
Gemini 2.5 Flash Preview TTS (gemini-2.5-flash-preview-tts) | Text | Audio | Low-latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.5 Pro Preview TTS (gemini-2.5-pro-preview-tts) | Text | Audio | Low-latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.0 Flash (gemini-2.0-flash) | Audio, images, video, and text | Text | Next-generation features, speed, and realtime streaming
Gemini 2.0 Flash Preview Image Generation (gemini-2.0-flash-preview-image-generation) | Audio, images, video, and text | Text, images | Conversational image generation and editing
Gemini 2.0 Flash-Lite (gemini-2.0-flash-lite) | Audio, images, video, and text | Text | Cost efficiency and low latency
Gemini 1.5 Flash (gemini-1.5-flash) | Audio, images, video, and text | Text | Fast and versatile performance across a diverse variety of tasks
Gemini 1.5 Flash-8B (gemini-1.5-flash-8b) | Audio, images, video, and text | Text | High-volume and lower-intelligence tasks
Gemini 1.5 Pro (gemini-1.5-pro) | Audio, images, video, and text | Text | Complex reasoning tasks requiring more intelligence
Gemini Embedding (gemini-embedding-exp) | Text | Text embeddings | Measuring the relatedness of text strings
Imagen 4 (imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06) | Text | Images | Our most up-to-date image generation model
Imagen 3 (imagen-3.0-generate-002) | Text | Images | High-quality image generation model
Veo 2 (veo-2.0-generate-001) | Text, images | Video | High-quality video generation
Gemini 2.5 Flash Live (gemini-live-2.5-flash-preview) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions
Gemini 2.0 Flash Live (gemini-2.0-flash-live-001) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions

You can view the rate limits for each model on the rate limits page.
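The token limits in the model cards below can also be queried at runtime. A hedged sketch, assuming the google-genai SDK; the metadata field names here are an assumption about the SDK's Model type:

from google import genai

client = genai.Client()

# Fetch one variant's metadata rather than hard-coding limits from this page.
info = client.models.get(model="gemini-2.5-flash")
print(info.display_name, info.input_token_limit, info.output_token_limit)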
Gemini 2.5 Pro
Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context.

Model details
Property | Description
Model code | gemini-2.5-pro
Supported data types | Inputs: audio, images, video, text, and PDF. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 65,536
Capabilities | Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search grounding: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Thinking: Supported. Batch API: Supported
Versions | Read the model version patterns for more details. Stable: gemini-2.5-pro. Preview: gemini-2.5-pro-preview-06-05, gemini-2.5-pro-preview-05-06, gemini-2.5-pro-preview-03-25
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash
Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large-scale processing, low-latency, high-volume tasks that require thinking, and agentic use cases.

Model details
Property | Description
Model code | models/gemini-2.5-flash
Supported data types | Inputs: text, images, video, and audio. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 65,536
Capabilities | Audio generation: Not supported. Caching: Supported. Code execution: Supported. Function calling: Supported. Image generation: Not supported. Search grounding: Supported. Structured outputs: Supported. Thinking: Supported. Tuning: Not supported. Batch API: Supported
Versions | Read the model version patterns for more details. Stable: gemini-2.5-flash. Preview: gemini-2.5-flash-preview-05-20
Latest update | June 2025
Knowledge cutoff | January 2025
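Both 2.5 models above list Thinking as supported, and the overview at the top of this page notes that 2.5 Flash lets you configure a thinking budget. As a hedged sketch (assuming the google-genai SDK's ThinkingConfig; the budget value here is arbitrary):

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain the Monty Hall problem briefly.",
    config=types.GenerateContentConfig(
        # Cap how many tokens the model may spend on internal thinking.
        thinking_config=types.ThinkingConfig(thinking_budget=1024),
    ),
)
print(response.text)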
Gemini 2.5 Flash-Lite Preview
A Gemini 2.5 Flash model optimized for cost efficiency and low latency.

Model details
Property | Description
Model code | models/gemini-2.5-flash-lite-preview-06-17
Supported data types | Inputs: text, images, video, and audio. Output: text
Token limits [*] | Input token limit: 1,000,000. Output token limit: 64,000
Capabilities | Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. URL context: Supported. Search grounding: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Thinking: Supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash Native Audio
Our native audio dialog models, with and without thinking, available through the Live API. These models provide interactive and unstructured conversational experiences, with style and control prompting.

Model details
Property | Description
Model code | models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog
Supported data types | Inputs: audio, video, and text. Output: audio and text
Token limits [*] | Input token limit: 128,000. Output token limit: 8,000
Capabilities | Audio generation: Supported. Caching: Not supported. Code execution: Not supported. Function calling: Supported. Image generation: Not supported. Search grounding: Supported. Structured outputs: Not supported. Thinking: Supported. Tuning: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20. Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog
Latest update | May 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash Preview Text-to-Speech
Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash Preview TTS rate limits are more restricted since it is an experimental/preview model.

Model details
Property | Description
Model code | models/gemini-2.5-flash-preview-tts
Supported data types | Inputs: text. Output: audio
Token limits [*] | Input token limit: 8,000. Output token limit: 16,000
Capabilities | Structured outputs: Not supported. Caching: Not supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Audio generation: Supported. Live API: Not supported. Thinking: Not supported
Versions | Read the model version patterns for more details. gemini-2.5-flash-preview-tts
Latest update | May 2025
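The TTS models take text in and return audio out through the same generateContent call used elsewhere in these docs. Below is a sketch of single-speaker speech generation; the voice name "Kore" (one of the prebuilt voices) and the 24 kHz, 16-bit mono PCM output format follow the speech-generation guide, but treat both as assumptions here.

Python
import wave

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The audio comes back as raw PCM bytes on the first candidate part.
pcm = response.candidates[0].content.parts[0].inline_data.data
with wave.open("out.wav", "wb") as wf:
    wf.setnchannels(1)    # mono
    wf.setsampwidth(2)    # 16-bit samples
    wf.setframerate(24000)
    wf.writeframes(pcm)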
Gemini 2.5 Pro Preview Text-to-Speech
Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro Preview TTS rate limits are more restricted since it is an experimental/preview model.

Model details
Property | Description
Model code | models/gemini-2.5-pro-preview-tts
Supported data types | Inputs: text. Output: audio
Token limits [*] | Input token limit: 8,000. Output token limit: 16,000
Capabilities | Structured outputs: Not supported. Caching: Not supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Audio generation: Supported. Live API: Not supported. Thinking: Not supported
Versions | Read the model version patterns for more details. gemini-2.5-pro-preview-tts
Latest update | May 2025

Gemini 2.0 Flash
Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M-token context window.

Model details
Property | Description
Model code | models/gemini-2.0-flash
Supported data types | Inputs: audio, images, video, and text. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Capabilities | Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Not supported. Live API: Supported. Thinking: Experimental. Batch API: Supported
Versions | Read the model version patterns for more details. Latest: gemini-2.0-flash. Stable: gemini-2.0-flash-001. Experimental: gemini-2.0-flash-exp
Latest update | February 2025
Knowledge cutoff | August 2024

Gemini 2.0 Flash Preview Image Generation
Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.

Model details
Property | Description
Model code | models/gemini-2.0-flash-preview-image-generation
Supported data types | Inputs: audio, images, video, and text. Output: text and images
Token limits [*] | Input token limit: 32,000. Output token limit: 8,192
Capabilities | Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Not supported. Code execution: Not supported. Search: Not supported. Image generation: Supported. Audio generation: Not supported. Live API: Not supported. Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation
Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update | May 2025
Knowledge cutoff | August 2024
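Because this model outputs text and images interleaved, you request both modalities and then walk the returned parts. A minimal sketch (the prompt and output filename are placeholders; the docs' examples treat the inline image bytes as a PNG):

Python
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Create a picture of a hummingbird sipping nectar.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# Text parts and inline image parts can be interleaved in one candidate.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        with open("hummingbird.png", "wb") as f:
            f.write(part.inline_data.data)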
Gemini 2.0 Flash-Lite
A Gemini 2.0 Flash model optimized for cost efficiency and low latency.

Model details
Property | Description
Model code | models/gemini-2.0-flash-lite
Supported data types | Inputs: audio, images, video, and text. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Capabilities | Structured outputs: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Not supported. Search: Not supported. Image generation: Not supported. Audio generation: Not supported. Live API: Not supported. Batch API: Supported
Versions | Read the model version patterns for more details. Latest: gemini-2.0-flash-lite. Stable: gemini-2.0-flash-lite-001
Latest update | February 2025
Knowledge cutoff | August 2024

Gemini 1.5 Flash
Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.

Model details
Property | Description
Model code | models/gemini-1.5-flash
Supported data types | Inputs: audio, images, video, and text. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 3,600. Maximum video length: 1 hour. Maximum audio length: approximately 9.5 hours
Capabilities | System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Supported. Function calling: Supported. Code execution: Supported. Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-flash-latest. Latest stable: gemini-1.5-flash. Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
Latest update | September 2024

Gemini 1.5 Flash-8B
Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks.

Model details
Property | Description
Model code | models/gemini-1.5-flash-8b
Supported data types | Inputs: audio, images, video, and text. Output: text
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 3,600. Maximum video length: 1 hour. Maximum audio length: approximately 9.5 hours
Capabilities | System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Supported. Function calling: Supported. Code execution: Supported. Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest. Latest stable: gemini-1.5-flash-8b. Stable: gemini-1.5-flash-8b-001
Latest update | October 2024

Gemini 1.5 Pro
Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.
Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.

Model details
Property | Description
Model code | models/gemini-1.5-pro
Supported data types | Inputs: audio, images, video, and text. Output: text
Token limits [*] | Input token limit: 2,097,152. Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 7,200. Maximum video length: 2 hours. Maximum audio length: approximately 19 hours
Capabilities | System instructions: Supported. JSON mode: Supported. JSON schema: Supported. Adjustable safety settings: Supported. Caching: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-pro-latest. Latest stable: gemini-1.5-pro. Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
Latest update | September 2024
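The Latest, Latest stable, and Stable version names listed for the 1.5 family are all valid model ids in a request. A small sketch (three separate calls, purely to illustrate the naming):

Python
from google import genai

client = genai.Client()

# "-latest", the bare alias, and a pinned "-001" build all address 1.5 Flash.
for model_id in ("gemini-1.5-flash-latest", "gemini-1.5-flash", "gemini-1.5-flash-001"):
    response = client.models.generate_content(model=model_id, contents="Say hi.")
    print(model_id, "->", response.text)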
Imagen 4
Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher-resolution output than previous models.

Model details
Property | Description
Model code (Gemini API) | imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
Supported data types | Input: text. Output: images
Token limits [*] | Input token limit: 480 tokens (text). Output images: 1 (Ultra); 1 to 4 (Standard)
Latest update | June 2025

Imagen 3
Imagen 3 is our highest-quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details
Property | Description
Model code (Gemini API) | imagen-3.0-generate-002
Supported data types | Input: text. Output: images
Token limits [*] | Input token limit: N/A. Output images: up to 4
Latest update | February 2025

Veo 2
Veo 2 is our high-quality text- and image-to-video model, capable of generating detailed videos and capturing the artistic nuance in your prompts.

Model details
Property | Description
Model code (Gemini API) | veo-2.0-generate-001
Supported data types | Input: text, image. Output: video
Limits | Text input: N/A. Image input: any image resolution and aspect ratio up to 20 MB file size. Output videos: up to 2
Latest update | April 2025

Gemini 2.5 Flash Live
The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details
Property | Description
Model code | models/gemini-live-2.5-flash-preview
Supported data types | Inputs: audio, video, and text. Output: text and audio
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Capabilities | Structured outputs: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Supported. Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.0 Flash Live
The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details
Property | Description
Model code | models/gemini-2.0-flash-live-001
Supported data types | Inputs: audio, video, and text. Output: text and audio
Token limits [*] | Input token limit: 1,048,576. Output token limit: 8,192
Capabilities | Structured outputs: Supported. Tuning: Not supported. Function calling: Supported. Code execution: Supported. Search: Supported. Image generation: Not supported. Audio generation: Supported. Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001
Latest update | April 2025
Knowledge cutoff | August 2024
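The Live models are used through a persistent session rather than one-shot generateContent calls. This is a rough sketch of a text-only Live API exchange, assuming the google-genai SDK's async live.connect interface; the method names follow the Live API guide but should be checked against your SDK version.

Python
import asyncio

from google import genai

client = genai.Client()

async def main():
    config = {"response_modalities": ["TEXT"]}
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        # Send one user turn, then stream back the model's reply.
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello!"}]},
            turn_complete=True,
        )
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())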
Gemini Embedding Experimental
Gemini embedding achieves SOTA performance across many key dimensions, including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details
Property | Description
Model code (Gemini API) | gemini-embedding-exp-03-07
Supported data types | Input: text. Output: text embeddings
Token limits [*] | Input token limit: 8,192. Output dimension size: elastic, supports 3072, 1536, or 768
Latest update | March 2025

Text Embedding and Embedding

Text Embedding
Try our new experimental Gemini embedding model, which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks. (A usage sketch appears after the AQA section below.)

Model details
Property | Description
Model code (Gemini API) | models/text-embedding-004
Supported data types | Input: text. Output: text embeddings
Token limits [*] | Input token limit: 2,048. Output dimension size: 768
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Not supported
Latest update | April 2024

Embedding
Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.
You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details
Property | Description
Model code | models/embedding-001
Supported data types | Input: text. Output: text embeddings
Token limits [*] | Input token limit: 2,048. Output dimension size: 768
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Not supported
Latest update | December 2023

AQA
You can use the AQA model to perform Attributed Question-Answering (AQA)-related tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with an estimate of the answerable probability.

Model details
Property | Description
Model code | models/aqa
Supported data types | Input: text. Output: text
Supported language | English
Token limits [*] | Input token limit: 7,168. Output token limit: 1,024
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Supported
Latest update | December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.
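As promised above, a minimal embedding sketch using text-embedding-004 with the google-genai SDK; per the table, each returned embedding has 768 values.

Python
from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)

# One embedding per input string; 768 dimensions for text-embedding-004.
[embedding] = result.embeddings
print(len(embedding.values))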
Model version name patterns
Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable
Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable
Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview
Points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental
Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models
In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models
As new versions or stable releases become available, we remove and replace experimental models. The following table lists the previous experimental models we released, along with their replacement versions (a sketch at the end of this page shows one way to track these replacements in code):

Model code | Base model | Replacement version
gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages
Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
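Experimental and preview ids can disappear without notice, so it can be useful to keep the "Previous experimental models" table above in code. The helper below is purely illustrative: the mapping is copied verbatim from the table, and the function name is ours, not part of any SDK.

Python
# Retired experimental/preview ids mapped to their documented replacements
# (copied from the "Previous experimental models" table above).
REPLACEMENTS = {
    "gemini-2.5-flash-preview-04-17": "gemini-2.5-flash-preview-05-20",
    "gemini-2.0-flash-exp-image-generation": "gemini-2.0-flash-preview-image-generation",
    "gemini-2.5-pro-preview-06-05": "gemini-2.5-pro",
    "gemini-2.5-pro-preview-05-06": "gemini-2.5-pro",
    "gemini-2.5-pro-preview-03-25": "gemini-2.5-pro",
    "gemini-2.0-flash-thinking-exp-01-21": "gemini-2.5-flash-preview-04-17",
    "gemini-2.0-pro-exp-02-05": "gemini-2.5-pro-preview-03-25",
    "gemini-2.0-flash-exp": "gemini-2.0-flash",
    "gemini-exp-1206": "gemini-2.0-pro-exp-02-05",
    "gemini-2.0-flash-thinking-exp-1219": "gemini-2.0-flash-thinking-exp-01-21",
    "gemini-exp-1121": "gemini-exp-1206",
    "gemini-exp-1114": "gemini-exp-1206",
    "gemini-1.5-pro-exp-0827": "gemini-exp-1206",
    "gemini-1.5-pro-exp-0801": "gemini-exp-1206",
    "gemini-1.5-flash-8b-exp-0924": "gemini-1.5-flash-8b",
    "gemini-1.5-flash-8b-exp-0827": "gemini-1.5-flash-8b",
}

def resolve_model(model_id: str) -> str:
    """Follow replacement links until reaching an id with no recorded successor."""
    while model_id in REPLACEMENTS:
        model_id = REPLACEMENTS[model_id]
    return model_id

print(resolve_model("gemini-exp-1114"))  # gemini-2.5-pro, via several hops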
\ No newline at end of file
diff --git a/text_content/docs_models_a578e84d.txt b/text_content/docs_models_a578e84d.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fcc8e577ec10ca7028d0d48595ada2ce89356315
--- /dev/null
+++ b/text_content/docs_models_a578e84d.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models#gemini-embedding
+Title: Gemini models | Gemini API | Google AI for Developers
+==================================================
\ No newline at end of file
diff --git a/text_content/docs_prompting-intro_bb8b36ca.txt b/text_content/docs_prompting-intro_bb8b36ca.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eef160b891047083d778d2b2c5eb819ce00f1763
--- /dev/null
+++ b/text_content/docs_prompting-intro_bb8b36ca.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/prompting-intro#completion
+Title: Prompt design strategies | Gemini API | Google AI for Developers
+==================================================

Prompt design strategies

Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high-quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models.

Topic-specific prompt guides
Looking for more specific prompt strategies? Check out our other prompting guides on: prompting with media files, prompting for image generation, and prompting for video generation. Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide.

Clear and specific instructions
An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset.

Input
Input is the required text in the prompt that you want the model to respond to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input).

Input type | Prompt | Generated output
Question | What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. | Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers 5. Forever Blooms
Task | Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. | 1. Tent: Your shelter from the elements. 2. Sleeping Bag: Stay warm and comfortable. 3. Sleeping Pad: Insulation and cushioning for a good night's sleep. 4. Headlamp or Flashlight: Light for nighttime activities. 5. First Aid Kit: Be prepared for minor injuries.
Entity | Classify the following items as [large, small]: Elephant Mouse Snail | Here's the classification: Elephant: Large. Mouse: Small. Snail: Small.
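Any of the input types above can be sent as the contents of a generateContent call. A minimal sketch using the task input from the table (the same Python SDK pattern used throughout these docs):

Python
from google import genai

client = genai.Client()

# A task input: the prompt tells the model to perform a task.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=(
        "Give me a simple list of just the things that I must bring on a "
        "camping trip. The list should have 5 items."
    ),
)
print(response.text)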
Partial input completion
Generative language models work like an advanced autocompletion tool. When you provide partial content, the model can provide the rest of the content, or what it thinks is a continuation of that content, as a response. When doing so, if you include any examples or context, the model can take those examples or context into account.

The following example provides a prompt with an instruction and an entity input:

Prompt:
For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity.
Order: A burger and a drink.

Response:
{
  "cheeseburger": 0,
  "hamburger": 1,
  "fries": 0,
  "drink": 1
}
(gemini-2.5-flash)

While the model did as prompted, writing out the instructions in natural language can sometimes be challenging, and it leaves a lot to the model's interpretation. For example, a restaurant's menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it:

Prompt:
Valid fields are cheeseburger, hamburger, fries, and drink.
Order: Give me a cheeseburger and fries
Output:
```
{
  "cheeseburger": 1,
  "fries": 1
}
```
Order: I want two burgers, a drink, and fries.
Output:

Response:
```
{
  "hamburger": 2,
  "drink": 1,
  "fries": 1
}
```
(gemini-2.5-flash)

Notice how "cheeseburger" was excluded from the output because it wasn't a part of the order.
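A completion prompt like the one above is sent as plain text; nothing in the API distinguishes the worked example from the trailing prefix. A sketch (the prompt string mirrors the example above):

Python
from google import genai

client = genai.Client()

# One worked example plus a trailing "Output:" prefix for the model to complete.
prompt = """Valid fields are cheeseburger, hamburger, fries, and drink.
Order: Give me a cheeseburger and fries
Output:
{
  "cheeseburger": 1,
  "fries": 1
}
Order: I want two burgers, a drink, and fries.
Output:"""

response = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
print(response.text)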
Constraints

Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and what not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be:

Prompt:
Summarize this text in one sentence:
Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer. At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical.

Response:
Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical.

(gemini-2.5-flash)

Response format

You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to be more comprehensive in its response:

System instruction:
All questions should be answered comprehensively with details, unless the user requests a concise response specifically.

Prompt:
What is a smart way to make a business that sells DVD's in 2025?

Response:
Starting a business selling DVDs in 2025 presents significant challenges due to the overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. ....

(gemini-2.5-flash)
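System instructions like the one above are passed separately from the user prompt. As a minimal sketch, assuming the Python SDK's `types.GenerateContentConfig` and its `system_instruction` field:

```
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is a smart way to make a business that sells DVD's in 2025?",
    config=types.GenerateContentConfig(
        # The system instruction shapes every response, independently of
        # whatever the user prompt itself says.
        system_instruction=(
            "All questions should be answered comprehensively with details, "
            "unless the user requests a concise response specifically."
        ),
    ),
)
print(response.text)
```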
Format responses with the completion strategy

The completion strategy can also help format the response. The following example prompts the model to create an essay outline:

Prompt:
Create an outline for an essay about hummingbirds.

Response:
Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification .....

(gemini-2.5-flash)

The prompt didn't specify a format for the outline, so the model chose one for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated.

Prompt:
Create an outline for an essay about hummingbirds.
I. Introduction
*

Response:
Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. .....

(gemini-2.5-flash)

Zero-shot vs few-shot prompts

You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results.

We recommend always including few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand.

The following zero-shot prompt asks the model to choose the best explanation:

Prompt:
Please choose the best explanation to the question:
Question: How is snow formed?
Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground.
Explanation2: Water vapor freezes into ice crystals forming snow.
Answer:

Response:
Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere.

(gemini-2.5-flash)

If your use case requires the model to produce concise responses, you can include examples in the prompt that give preference to concise responses. The following prompt provides two examples that show preference for the shorter explanations. In the response, you can see that the examples guided the model to choose the shorter explanation (Explanation2), as opposed to the longer explanation (Explanation1) like it did previously.

Prompt:
Below are some examples showing a question, explanation, and answer format:
Question: Why is the sky blue?
Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue.
Explanation2: Due to Rayleigh scattering effect.
Answer: Explanation2
Question: What is the cause of earthquakes?
Explanation1: Sudden release of energy in the Earth's crust.
Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage.
Answer: Explanation1
Now, Answer the following question given the example formats above:
Question: How is snow formed?
Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground.
Explanation2: Water vapor freezes into ice crystals forming snow.
Answer:

Response:
Answer: Explanation2

(gemini-2.5-flash)

Optimal number of examples

Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples.

Patterns vs anti-patterns

Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti-pattern to avoid.

⛔ Negative pattern:
Example
Don't end haikus with a question:
Haiku are fun
A short and simple poem
Don't you enjoy them?

✅ Positive pattern:
Example
Always end haikus with an assertion:
Haiku are fun
A short and simple poem
A joy to write

Consistent formatting

Make sure that the structure and formatting of few-shot examples are the same, to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, white space, newlines, and example splitters.
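One way to enforce consistent formatting is to assemble the few-shot block from data instead of writing it by hand, so every example shares identical labels, spacing, and splitters. The helper below is a hypothetical sketch (`build_few_shot_prompt` is not part of any SDK), using the question/explanation format from the examples above:

```
# Hypothetical helper: builds a few-shot prompt in which every example
# (and the final query) uses identical labels, newlines, and splitters.
def build_few_shot_prompt(examples, query):
    blocks = []
    for ex in examples:
        blocks.append(
            f"Question: {ex['question']}\n"
            f"Explanation1: {ex['explanation1']}\n"
            f"Explanation2: {ex['explanation2']}\n"
            f"Answer: {ex['answer']}"
        )
    # The final block repeats the same layout but leaves "Answer:" open
    # for the model to complete.
    blocks.append(
        f"Question: {query['question']}\n"
        f"Explanation1: {query['explanation1']}\n"
        f"Explanation2: {query['explanation2']}\n"
        "Answer:"
    )
    return "\n\n".join(blocks)
```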
Add context

You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking it to do. The following example asks the model to give troubleshooting guidance for a router:

Prompt:
What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly.

Response:
A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. .....

(gemini-2.5-flash)

The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add the router's troubleshooting guide to the prompt as context for the model to refer to when providing a response.

Prompt:
Answer the question using the text below. Respond with only the text provided.
Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly.
Text:
Color: Slowly pulsing yellow
What it means: There is a network error.
What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again.
Color: Fast blinking yellow
What it means: You are holding down the reset button and are factory resetting this device.
What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button.
Color: Solid yellow
What it means: Router is factory resetting.
What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup.
Color: Solid red
What it means: Something is wrong.
What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support.

Response:
Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again.

(gemini-2.5-flash)

Add prefixes

A prefix is a word or phrase that you add to the prompt content. It can serve several purposes, depending on where you put it:

Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages.

Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format.

Example prefix: In few-shot prompts, adding prefixes to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content.

In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix.

Prompt:
Classify the text as one of the following categories.
- large
- small
Text: Rhino
The answer is: large
Text: Mouse
The answer is: small
Text: Snail
The answer is: small
Text: Elephant
The answer is:

Response:
The answer is: large

(gemini-2.5-flash)

Break down prompts into components

For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components.

Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input.

Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence, as shown in the sketch after this list. In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output.

Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data, and aggregate the results.
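A prompt chain can be a few lines of code: each step sends one prompt and feeds its output into the next step's prompt. This is an illustrative sketch using the Python SDK; the three step prompts and the `transcript` input are hypothetical:

```
from google import genai

client = genai.Client()
MODEL = "gemini-2.5-flash"

def generate(prompt: str) -> str:
    response = client.models.generate_content(model=MODEL, contents=prompt)
    return response.text

# Hypothetical three-step chain: each step's output becomes the next input.
transcript = "..."  # e.g., raw meeting notes supplied by your application

summary = generate(f"Summarize the following meeting notes:\n{transcript}")
actions = generate(f"List the action items in this summary:\n{summary}")
email = generate(f"Draft a short follow-up email covering these action items:\n{actions}")

print(email)  # The output of the last prompt is the final output.
```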
Experiment with model parameters

Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to find the best values for the task. The available parameters may differ from model to model. The most common parameters are the following (a configuration sketch follows this list):

Max output tokens: Specifies the maximum number of tokens that can be generated in the response. A token is approximately four characters, so 100 tokens correspond to roughly 60-80 words.

Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest-probability response is always selected.

topK: The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable, using the temperature. For each token-selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP, with the final token selected using temperature sampling.

topP: The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have probabilities of 0.3, 0.2, and 0.1, and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature, and exclude C as a candidate. The default topP value is 0.95.

stop_sequences: Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Avoid using a sequence of characters that may appear in the generated content.
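With the Python SDK, these parameters are typically set per request through a config object. A minimal sketch, assuming the `types.GenerateContentConfig` fields shown below; the parameter values are arbitrary examples, not recommendations:

```
from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a one-line tagline for a dried-flower shop.",
    config=types.GenerateContentConfig(
        temperature=0.2,          # low randomness: more deterministic output
        top_p=0.95,               # nucleus sampling cutoff
        top_k=40,                 # consider only the 40 most probable tokens
        max_output_tokens=60,     # cap the response length
        stop_sequences=["\n\n"],  # stop generating at the first blank line
    ),
)
print(response.text)
```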
Prompt iteration strategies

Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts:

Use different phrasing: Using different words or phrasing in your prompts often yields different responses from the model, even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it.

Version 1: How do I bake a pie?
Version 2: Suggest a recipe for a pie.
Version 3: What's a good pie recipe?

Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories:

Prompt:
Which category does The Odyssey belong to:
thriller
sci-fi
mythology
biography

Response:
The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. .....

(gemini-2.5-flash)

The response is correct, but the model didn't stay within the bounds of the options. You also want the model to respond with just one of the options instead of a full sentence. In this case, you can rephrase the instructions as a multiple-choice question and ask the model to choose an option.

Prompt:
Multiple choice problem: Which of the following options describes the book The Odyssey?
Options:
thriller
sci-fi
mythology
biography

Response:
The correct answer is mythology.

(gemini-2.5-flash)

Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response.

Version 1: [examples] [context] [input]
Version 2: [input] [examples] [context]
Version 3: [examples] [input] [context]

Fallback responses

A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature.

Things to avoid

- Avoid relying on models to generate factual information.
- Use with care on math and logic problems.

Generative models under the hood

This section aims to answer the question: is there randomness in generative models' responses, or are they deterministic? The short answer: yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ...", the generative model will produce an array of probable next words:

[("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...]

This process is deterministic; a generative model will produce this same distribution every time it's given the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses.
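To make the two-stage picture concrete, the following standalone Python sketch simulates temperature sampling over the example distribution above. This only illustrates the idea; it is not how Gemini implements decoding internally:

```
import random

# The illustrative next-token distribution from the text above.
distribution = [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03)]

def sample_with_temperature(dist, temperature):
    # Temperature 0 degenerates to greedy decoding: always pick the
    # most probable token, so the output is fully deterministic.
    if temperature == 0:
        return max(dist, key=lambda pair: pair[1])[0]
    # Raising each probability to the power 1/temperature sharpens the
    # distribution when temperature < 1 and flattens it when > 1.
    weights = [p ** (1.0 / temperature) for _, p in dist]
    tokens = [token for token, _ in dist]
    return random.choices(tokens, weights=weights)[0]

print(sample_with_temperature(distribution, 0))    # always "fence"
print(sample_with_temperature(distribution, 1.0))  # usually "fence"
print(sample_with_temperature(distribution, 2.0))  # rarer tokens appear more often
```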
Next steps

Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio. To learn about multimodal prompting, see Prompting with media files. To learn about image prompting, see the Imagen prompt guide. To learn about video prompting, see the Veo prompt guide.
In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it's input the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn about image prompting, see the Imagen prompt guide To learn about video prompting, see the Veo prompt guide Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-04-28 UTC. \ No newline at end of file diff --git a/text_content/docs_prompting-strategies_8f25ac45.txt b/text_content/docs_prompting-strategies_8f25ac45.txt new file mode 100644 index 0000000000000000000000000000000000000000..443b434f92359ac5c5da50170e65c1ec240cf7ca --- /dev/null +++ b/text_content/docs_prompting-strategies_8f25ac45.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/prompting-strategies +Title: Prompt design strategies  |  Gemini API  |  Google AI for Developers +================================================== + +Prompt design strategies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Prompt design strategies Prompt design is the process of creating prompts, or natural language requests, that elicit accurate, high quality responses from a language model. This page introduces basic concepts, strategies, and best practices to get you started designing prompts to get the most out of Gemini AI models. Topic-specific prompt guides Looking for more specific prompt strategies? Check out our other prompting guides on: Prompting with media files Prompting for image generation Prompting for video generation Google AI Studio also hosts a prompt gallery meant to interactively showcase many of the concepts shared in this guide. 
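The prompts shown throughout this guide can be sent to the Gemini API directly. As a minimal sketch with the google-genai Python SDK (assuming a configured GEMINI_API_KEY; the prompt text is a placeholder):

Python
# Sketch: send a plain text prompt and print the response.
from google import genai

client = genai.Client()  # reads GEMINI_API_KEY from the environment
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What's a good name for a flower shop that specializes in selling bouquets of dried flowers?",
)
print(response.text)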
Clear and specific instructions An effective and efficient way to customize model behavior is to provide it with clear and specific instructions. Instructions can be in the form of a question, step-by-step tasks, or as complex as mapping out a user's experience and mindset. Input Input is the required text in the prompt that you want the model to provide a response to. Inputs can be a question that the model answers (question input), a task the model performs (task input), an entity the model operates on (entity input), or partial input that the model completes or continues (completion input). Input type Prompt Generated output Question What's a good name for a flower shop that specializes in selling bouquets of dried flowers? Create a list of 5 options with just the names. Here are 10 names for a flower shop specializing in dried flowers: 1. Everlasting Blooms 2. Dried & Delightful 3. The Petal Preserve 4. Whispers of Wildflowers 5. Forever Blooms Task Give me a simple list of just the things that I must bring on a camping trip. The list should have 5 items. 1. **Tent:** Your shelter from the elements. 2. **Sleeping Bag:** Stay warm and comfortable. 3. **Sleeping Pad:** Insulation and cushioning for a good night's sleep. 4. **Headlamp or Flashlight:** Light for nighttime activities. 5. **First Aid Kit:** Be prepared for minor injuries. Entity Classify the following items as [large, small]: Elephant Mouse Snail Here's the classification: * **Elephant:** Large * **Mouse:** Small * **Snail:** Small Partial input completion Generative language models work like an advanced autocomplete tool. When you provide partial content, the model can provide the rest of the content or what it thinks is a continuation of that content as a response. When doing so, if you include any examples or context, the model can take those examples or context into account. The following example provides a prompt with an instruction and an entity input: Prompt: For the given order, return a JSON object that has the fields cheeseburger, hamburger, fries, or drink, with the value being the quantity. Order: A burger and a drink. Response: { "cheeseburger": 0, "hamburger": 1, "fries": 0, "drink": 1 } (gemini-2.5-flash) While the model did as prompted, writing out the instructions in natural language can sometimes be challenging, and it leaves a lot to the model's interpretation. For example, a restaurant's menu might contain many items. To reduce the size of the JSON response, you probably want to omit the items that weren't ordered. In this case, you can give an example and a response prefix and let the model complete it: Prompt: Valid fields are cheeseburger, hamburger, fries, and drink. Order: Give me a cheeseburger and fries Output: ``` { "cheeseburger": 1, "fries": 1 } ``` Order: I want two burgers, a drink, and fries. Output: Response: ``` { "hamburger": 2, "drink": 1, "fries": 1 } ``` (gemini-2.5-flash) Notice how "cheeseburger" was excluded from the output because it wasn't a part of the order. Constraints Specify any constraints on reading the prompt or generating a response. You can tell the model what to do and not to do. For example, you can specify a constraint in the prompt on how long you want a summary to be: Prompt: Summarize this text in one sentence: Text: A quantum computer exploits quantum mechanical phenomena to perform calculations exponentially faster than any modern traditional computer.
At very tiny scales, physical matter acts as both particles and as waves, and quantum computing uses specialized hardware to leverage this behavior. The operating principles of quantum devices is beyond the scope of classical physics. When deployed at scale, quantum computers could be used in a wide variety of applications such as: in cybersecurity to break existing encryption methods while helping researchers create new ones, in meteorology to develop better weather forecasting etc. However, the current state of the art quantum computers are still largely experimental and impractical. Response: Exploiting quantum mechanical phenomena, quantum computers can perform calculations exponentially faster than traditional computers for potential applications like cybersecurity and meteorology, although they are currently largely experimental and impractical. (gemini-2.5-flash) Response format You can give instructions that specify the format of the response. For example, you can ask for the response to be formatted as a table, bulleted list, elevator pitch, keywords, sentence, or paragraph. The following system instruction tells the model to answer comprehensively by default: System instruction All questions should be answered comprehensively with details, unless the user requests a concise response specifically. Prompt What is a smart way to make a business that sells DVD's in 2025? Response: Starting a business selling DVDs in 2025 presents significant challenges due to the overwhelming dominance of digital streaming and on-demand services; a "smart" approach would therefore require a highly specialized niche, a strong value proposition beyond just the disc, and likely a focus on community or collector markets rather than mainstream consumption. Here's a breakdown of what a smart DVD business in 2025 would likely entail: 1. Extreme Niche Specialization: You cannot compete with streaming libraries. Your business must focus on what digital doesn't offer easily or at all. This could include: * Out-of-Print & Rare Titles: Sourcing films or TV shows that have never been released digitally or are currently unavailable on major streaming platforms. .... (gemini-2.5-flash)
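With the google-genai Python SDK, a system instruction like the one above is passed through the request config rather than the prompt itself. A minimal sketch, reusing the instruction and prompt from the example (types.GenerateContentConfig is the SDK's per-request configuration object):

Python
# Sketch: set a system instruction on a single request.
# The instruction and prompt strings come from the example above.
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="What is a smart way to make a business that sells DVD's in 2025?",
    config=types.GenerateContentConfig(
        system_instruction=(
            "All questions should be answered comprehensively with details, "
            "unless the user requests a concise response specifically."
        ),
    ),
)
print(response.text)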
Format responses with the completion strategy The completion strategy can also help format the response. The following example prompts the model to create an essay outline: Prompt: Create an outline for an essay about hummingbirds. Response: Here is a possible outline for an essay about hummingbirds: I. Introduction A. Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). B. Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds. C. Thesis Statement: State the main point of the essay, which could be about the remarkable adaptations, behaviors, and ecological importance of hummingbirds. II. Appearance and Identification ..... (gemini-2.5-flash) The prompt didn't specify the format for the outline and the model chose a format for you. To get the model to return an outline in a specific format, you can add text that represents the start of the outline and let the model complete it based on the pattern that you initiated. Prompt: Create an outline for an essay about hummingbirds. I. Introduction * Response: Okay, here is an outline for an essay about hummingbirds, starting from the introduction you provided: I. Introduction * Hook: Start with a captivating fact or description about hummingbirds (e.g., their speed, iridescent colors, or tiny size). * Background: Briefly introduce hummingbirds – where they are found (Americas), their reputation as unique birds, and their general characteristics. * Thesis Statement: State the main point of the essay, focusing on the remarkable adaptations, behaviors, and ecological significance that make hummingbirds extraordinary. ..... (gemini-2.5-flash) Zero-shot vs few-shot prompts You can include examples in the prompt that show the model what getting it right looks like. The model attempts to identify patterns and relationships from the examples and applies them when generating a response. Prompts that contain a few examples are called few-shot prompts, while prompts that provide no examples are called zero-shot prompts. Few-shot prompts are often used to regulate the formatting, phrasing, scoping, or general patterning of model responses. Use specific and varied examples to help the model narrow its focus and generate more accurate results. We recommend always including few-shot examples in your prompts. Prompts without few-shot examples are likely to be less effective. In fact, you can remove instructions from your prompt if your examples are clear enough in showing the task at hand. The following zero-shot prompt asks the model to choose the best explanation. Prompt: Please choose the best explanation to the question: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Explanation1 is the better explanation because it provides more detail on the process, including how ice crystals combine and grow into snowflakes as they fall through the atmosphere. (gemini-2.5-flash) If your use case requires the model to produce concise responses, you can include examples in the prompt that show a preference for the shorter explanations. The following prompt provides two such examples. In the response, you can see that the examples guided the model to choose the shorter explanation ( Explanation2 ) as opposed to the longer explanation ( Explanation1 ) like it did previously. Prompt: Below are some examples showing a question, explanation, and answer format: Question: Why is the sky blue? Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue. Explanation2: Due to Rayleigh scattering effect. Answer: Explanation2 Question: What is the cause of earthquakes? Explanation1: Sudden release of energy in the Earth's crust. Explanation2: Earthquakes happen when tectonic plates suddenly slip or break apart, causing a release of energy that creates seismic waves that can shake the ground and cause damage. Answer: Explanation1 Now, Answer the following question given the example formats above: Question: How is snow formed? Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground. Explanation2: Water vapor freezes into ice crystals forming snow. Answer: Response: Answer: Explanation2 (gemini-2.5-flash)
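In code, a few-shot prompt like this is just text sent as the request contents. A minimal sketch with the google-genai Python SDK (shortened to one example for brevity):

Python
# Sketch: few-shot examples and the final, unanswered question
# are sent together as one text prompt.
from google import genai

client = genai.Client()

few_shot_prompt = """Below are some examples showing a question, explanation, and answer format:

Question: Why is the sky blue?
Explanation1: The sky appears blue because of Rayleigh scattering, which causes shorter blue wavelengths of light to be scattered more easily than longer red wavelengths, making the sky look blue.
Explanation2: Due to Rayleigh scattering effect.
Answer: Explanation2

Now, answer the following question given the example format above:

Question: How is snow formed?
Explanation1: Snow is formed when water vapor in the air freezes into ice crystals in the atmosphere, which can combine and grow into snowflakes as they fall through the atmosphere and accumulate on the ground.
Explanation2: Water vapor freezes into ice crystals forming snow.
Answer:"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=few_shot_prompt,
)
print(response.text)  # The example steers the model toward "Explanation2"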
Optimal number of examples Models like Gemini can often pick up on patterns using a few examples, though you may need to experiment with the number of examples to provide in the prompt for the best results. At the same time, if you include too many examples, the model may start to overfit the response to the examples. Patterns vs. anti-patterns Using examples to show the model a pattern to follow is more effective than using examples to show the model an anti-pattern to avoid. ⛔ Negative pattern: Example Don't end haikus with a question: Haiku are fun A short and simple poem Don't you enjoy them? ✅ Positive pattern: Example Always end haikus with an assertion: Haiku are fun A short and simple poem A joy to write Consistent formatting Make sure that the structure and formatting of few-shot examples are the same to avoid responses with undesired formats. One of the primary objectives of adding few-shot examples in prompts is to show the model the response format. Therefore, it is essential to ensure a consistent format across all examples, especially paying attention to XML tags, whitespace, newlines, and example splitters. Add context You can include instructions and information in a prompt that the model needs to solve a problem, instead of assuming that the model has all of the required information. This contextual information helps the model understand the constraints and details of what you're asking it to do. The following example asks the model to give troubleshooting guidance for a router: Prompt: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Response: A yellow blinking light on your Google Wifi router usually indicates a problem with the internet connection. Here are some steps to troubleshoot and fix the issue: 1. Restart your router and modem: - Unplug your modem from the power outlet and wait for 30 seconds. - Plug your modem back in and wait for it to fully boot up. - Unplug your Google Wifi router and wait for 30 seconds. - Plug your Google Wifi router back in and wait for it to fully boot up. ..... (gemini-2.5-flash) The response looks like generic troubleshooting information that's not specific to the router or the status of the LED indicator lights. To customize the response for the specific router, you can add the router's troubleshooting guide to the prompt as context for the model to refer to when providing a response. Prompt: Answer the question using the text below. Respond with only the text provided. Question: What should I do to fix my disconnected wifi? The light on my Google Wifi router is yellow and blinking slowly. Text: Color: Slowly pulsing yellow What it means: There is a network error. What to do: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. Color: Fast blinking yellow What it means: You are holding down the reset button and are factory resetting this device. What to do: If you keep holding down the reset button, after about 12 seconds, the light will turn solid yellow. Once it is solid yellow, let go of the factory reset button. Color: Solid yellow What it means: Router is factory resetting. What to do: This can take up to 10 minutes. When it's done, the device will reset itself and start pulsing white, letting you know it's ready for setup. Color: Solid red What it means: Something is wrong.
What to do: Critical failure. Factory reset the router. If the light stays red, contact Wifi customer support. Response: Check that the Ethernet cable is connected to both your router and your modem and both devices are turned on. You might need to unplug and plug in each device again. (gemini-2.5-flash) Add prefixes A prefix is a word or phrase that you add to the prompt content. It can serve several purposes, depending on where you put it: Input prefix: Adding a prefix to the input signals semantically meaningful parts of the input to the model. For example, the prefixes "English:" and "French:" demarcate two different languages. Output prefix: Even though the output is generated by the model, you can add a prefix for the output in the prompt. The output prefix gives the model information about what's expected as a response. For example, the output prefix "JSON:" signals to the model that the output should be in JSON format. Example prefix: In few-shot prompts, adding prefixes to the examples provides labels that the model can use when generating the output, which makes it easier to parse output content. In the following example, "Text:" is the input prefix and "The answer is:" is the output prefix. Prompt: Classify the text as one of the following categories. - large - small Text: Rhino The answer is: large Text: Mouse The answer is: small Text: Snail The answer is: small Text: Elephant The answer is: Response: The answer is: large (gemini-2.5-flash) Break down prompts into components For use cases that require complex prompts, you can help the model manage this complexity by breaking things down into simpler components. Break down instructions: Instead of having many instructions in one prompt, create one prompt per instruction. You can choose which prompt to process based on the user's input. Chain prompts: For complex tasks that involve multiple sequential steps, make each step a prompt and chain the prompts together in a sequence (see the sketch following this list). In this sequential chain of prompts, the output of one prompt in the sequence becomes the input of the next prompt. The output of the last prompt in the sequence is the final output. Aggregate responses: Aggregation is when you want to perform different parallel tasks on different portions of the data and aggregate the results to produce the final output. For example, you can tell the model to perform one operation on the first part of the data, perform another operation on the rest of the data and aggregate the results.
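To illustrate chaining, here is a minimal sketch with the google-genai Python SDK in which the output of one call becomes the input of the next. The two-step split and the prompt wording are hypothetical, for illustration only:

Python
# Sketch of prompt chaining: step 1's response feeds step 2's prompt.
# The document text and prompt wording are placeholders.
from google import genai

client = genai.Client()

document = "..."  # the source text to process

# Step 1: extract key points from the document.
step1 = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"List the key points in this text:\n{document}",
)

# Step 2: the output of step 1 becomes the input of step 2.
step2 = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"Write a one-paragraph summary based on these key points:\n{step1.text}",
)
print(step2.text)  # the final output of the chain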
Experiment with model parameters Each call that you send to a model includes parameter values that control how the model generates a response. The model can generate different results for different parameter values. Experiment with different parameter values to find the best values for the task. The parameters available for different models may differ. The most common parameters are the following: Max output tokens: Specifies the maximum number of tokens that can be generated in the response. A token is approximately four characters. 100 tokens correspond to roughly 60-80 words. Temperature: The temperature controls the degree of randomness in token selection. The temperature is used for sampling during response generation, which occurs when topP and topK are applied. Lower temperatures are good for prompts that require a more deterministic or less open-ended response, while higher temperatures can lead to more diverse or creative results. A temperature of 0 is deterministic, meaning that the highest probability response is always selected. topK : The topK parameter changes how the model selects tokens for output. A topK of 1 means the selected token is the most probable among all the tokens in the model's vocabulary (also called greedy decoding), while a topK of 3 means that the next token is selected from among the 3 most probable using the temperature. For each token selection step, the topK tokens with the highest probabilities are sampled. Tokens are then further filtered based on topP with the final token selected using temperature sampling. topP : The topP parameter changes how the model selects tokens for output. Tokens are selected from the most to least probable until the sum of their probabilities equals the topP value. For example, if tokens A, B, and C have probabilities of 0.3, 0.2, and 0.1 and the topP value is 0.5, then the model will select either A or B as the next token by using the temperature and exclude C as a candidate. The default topP value is 0.95. stop_sequences : Set a stop sequence to tell the model to stop generating content. A stop sequence can be any sequence of characters. Try to avoid using a sequence of characters that may appear in the generated content.
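With the google-genai Python SDK, these parameters are set per request through types.GenerateContentConfig. A minimal sketch with illustrative values (the prompt and values are placeholders, not recommendations):

Python
# Sketch: setting common generation parameters on a single request.
# The parameter values are illustrative only.
from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Write a tagline for a dried flower shop.",
    config=types.GenerateContentConfig(
        max_output_tokens=200,    # cap the length of the response
        temperature=0.4,          # lower = less random
        top_k=40,                 # sample among the 40 most probable tokens
        top_p=0.95,               # nucleus sampling threshold
        stop_sequences=["STOP"],  # end generation if this string is produced
    ),
)
print(response.text)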
Prompt iteration strategies Prompt design can sometimes require a few iterations before you consistently get the response you're looking for. This section provides guidance on some things you can try when iterating on your prompts: Use different phrasing: Using different words or phrasing in your prompts often yields different responses from the model even though they all mean the same thing. If you're not getting the expected results from your prompt, try rephrasing it. Version 1: How do I bake a pie? Version 2: Suggest a recipe for a pie. Version 3: What's a good pie recipe? Switch to an analogous task: If you can't get the model to follow your instructions for a task, try giving it instructions for an analogous task that achieves the same result. This prompt tells the model to categorize a book by using predefined categories: Prompt: Which category does The Odyssey belong to: thriller sci-fi mythology biography Response: The Odyssey belongs to the category of **mythology**. Here's why: * **Mythology:** The Odyssey tells the story of Odysseus, a hero from Greek mythology, and his journey home after the Trojan War. It features gods, monsters, and supernatural events common to Greek mythology. ..... (gemini-2.5-flash) The response is correct, but the model didn't stay within the bounds of the options. You also want the model to respond with just one of the options instead of a full sentence. In this case, you can rephrase the instructions as a multiple choice question and ask the model to choose an option. Prompt: Multiple choice problem: Which of the following options describes the book The Odyssey? Options: thriller sci-fi mythology biography Response: The correct answer is mythology . (gemini-2.5-flash) Change the order of prompt content: The order of the content in the prompt can sometimes affect the response. Try changing the content order and see how that affects the response. Version 1 : [ examples ] [ context ] [ input ] Version 2 : [ input ] [ examples ] [ context ] Version 3 : [ examples ] [ input ] [ context ] Fallback responses A fallback response is a response returned by the model when either the prompt or the response triggers a safety filter. An example of a fallback response is "I'm not able to help with that, as I'm only a language model." If the model responds with a fallback response, try increasing the temperature. Things to avoid Avoid relying on models to generate factual information. Use with care on math and logic problems. Generative models under the hood This section aims to answer the question: Is there randomness in generative models' responses, or are they deterministic? The short answer: yes to both. When you prompt a generative model, a text response is generated in two stages. In the first stage, the generative model processes the input prompt and generates a probability distribution over possible tokens (words) that are likely to come next. For example, if you prompt with the input text "The dog jumped over the ... ", the generative model will produce an array of probable next words: [("fence", 0.77), ("ledge", 0.12), ("blanket", 0.03), ...] This process is deterministic; a generative model will produce this same distribution every time it receives the same prompt text. In the second stage, the generative model converts these distributions into actual text responses through one of several decoding strategies. A simple decoding strategy might select the most likely token at every timestep. This process would always be deterministic. However, you could instead choose to generate a response by randomly sampling over the distribution returned by the model. This process would be stochastic (random). Control the degree of randomness allowed in this decoding process by setting the temperature. A temperature of 0 means only the most likely tokens are selected, and there's no randomness. Conversely, a high temperature injects a high degree of randomness into the tokens selected by the model, leading to more unexpected, surprising model responses. Next steps Now that you have a deeper understanding of prompt design, try writing your own prompts using Google AI Studio . To learn about multimodal prompting, see Prompting with media files . To learn about image prompting, see the Imagen prompt guide . To learn about video prompting, see the Veo prompt guide .
\ No newline at end of file
diff --git a/text_content/docs_prompting_with_media_1b5d173d.txt b/text_content/docs_prompting_with_media_1b5d173d.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0668d6cdc14ea63bcb13ac991735f6d378418191
--- /dev/null
+++ b/text_content/docs_prompting_with_media_1b5d173d.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/prompting_with_media?lang=python#prompting-with-videos
+Title: Files API  |  Gemini API  |  Google AI for Developers
+==================================================
+
+Files API
The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models, or described as having multimodal capabilities. This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section. Upload a file You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent . Python from google import genai client = genai . Client () myfile = client . files . upload ( file = "path/to/sample.mp3" ) response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ "Describe this audio clip" , myfile ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : createUserContent ([ createPartFromUri ( myfile . uri , myfile . mimeType ), "Describe this audio clip" , ]), }); console . log ( response . text ); } await main (); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } defer client . DeleteFile ( ctx , file . Name ) model := client . GenerativeModel ( "gemini-2.0-flash" ) resp , err := model . GenerateContent ( ctx , genai . FileData { URI : file . URI }, genai . Text ( "Describe this audio clip" )) if err != nil { log . Fatal ( err ) } printResponse ( resp ) REST BASE_URL = "https://generativelanguage.googleapis.com" AUDIO_PATH = "path/to/sample.mp3" MIME_TYPE = $( file -b --mime-type " ${ AUDIO_PATH } " ) NUM_BYTES = $( wc -c < " ${ AUDIO_PATH } " ) DISPLAY_NAME = AUDIO tmp_header_file = upload-header.tmp # Initial resumable request defining metadata. # The upload URL is in the response headers; dump them to a file. curl " ${ BASE_URL } /upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D " ${ tmp_header_file } " \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " # Upload the actual bytes.
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl "https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. 
The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available. File prompting strategies This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog about the meal. Prompt Response Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping. Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping! If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices: Prompt design fundamentals Be specific in your instructions : Craft clear and concise instructions that leave minimal room for misinterpretation. Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve. Break it down step-by-step : Divide complex tasks into manageable sub-goals, guiding the model through the process. Specify the output format : In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML and more. Put your image first for single-image prompts : While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved with texts to make sense, use whatever order is most natural. Troubleshooting your multimodal prompt If the model is not drawing information from the relevant part of the image: Drop hints with which aspects of the image you want the prompt to draw information from. 
If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image. To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding. If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details. Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity. Be specific in your instructions Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt. Prompt Model response Describe this image. The image shows an airport arrivals and departures board. Updated prompt Improved response Parse the time and city from the airport board shown in this image into a list. 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester Add a few examples The Gemini model can accept multiple inputs which it can use as examples to understand the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want. Prompt Model response Determine the city along with the landmark. The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil. Updated prompt Improved response Determine the city along with the landmark. city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue Break it down step-by-step For complex tasks like the ones that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it could also be effective if you directly ask the model to “think step by step” in your prompt. Prompt Model response When will I run out of toilet paper? Soon, you only have 3 rolls left. Updated prompt Improved response 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last. 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month. 
Math problems or other types of word problems are great candidates for asking the model to think step-by-step. Prompt Response What is the 4th term in the sequence? -135 The response from the model is incorrect. One way to improve this is to ask the model to parse the problem before answering the prompt. Prompt Response Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?". Click to expand the result $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405. This model response is correct. Now, to make the response even more streamlined, you can ask the model to think step by step. Updated prompt Improved response What is the 4th term in the sequence? Think step by step. The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405. Specify the output format A common problem is the need for model output to be in a certain format (e.g. markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples: Prompt Model response Parse the table in this image into markdown format | Subject | Status | Time | Start date | Due on | | ------- | ------ | ---- | ---------- | ------ | | Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 | | Geography | Done | 30 min | 12/2/2023 | 12/4/2023 | | Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 | | Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 | Prompt Model response Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format ```json { "ingredients": [ "rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper" ], "type of cuisine": "Spanish", "vegetarian": false } ``` Put your image first for single-image prompts While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt.
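As a sketch with the google-genai Python SDK, image-first ordering just means placing the image part before the text in the request contents. The file path and prompt below are placeholders:

Python
# Sketch: for a single-image prompt, put the image before the text.
# The file path and prompt are placeholders.
from google import genai

client = genai.Client()

image = client.files.upload(file="path/to/airport-board.jpg")
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[image, "Parse the time and city from the airport board shown in this image into a list."],
)
print(response.text)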
Troubleshooting your multimodal prompt You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try. If the model is not drawing information from the relevant part of the image To get a more specific response, you can point out which aspects of the image you want the model to draw information from to inform its response. Prompt Model response How many days will these diapers last a baby? The diapers will last for 198 days before they run out. Updated prompt Improved response How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day. The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6). If the model output is too generic and not tailored enough to the image input To help the model tailor its response to the image(s), try asking it to describe the images before performing its reasoning task. Prompt Model response What is in common between these images? All four images are examples of landscape photography. Updated prompt Improved response First, describe what's in each image in detail. What's in common between these images? The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action. You can also try explicitly asking the model to refer to what's in the image. Updated prompt Improved response What is in common between these images? Refer to what's in the images in your response. All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling. The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output. Prompt Model response Can you write me a poem about this image? Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me.
But I'll always be your friend, Little bunny, until the very end. Updated prompt Improved response Can you write me a descriptive and dramatic poem about this image and include the location? Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation. Troubleshooting which part of the prompt failed It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g. popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? Describe what's in this image. Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any. Prompt Prompt for troubleshooting What's a snack I can make in 1 minute that would go well with this? What's a snack I can make in 1 minute that would go well with this? Please explain why. What's next Try writing your own multimodal prompts using Google AI Studio . For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision , Audio , and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page.
\ No newline at end of file
diff --git a/text_content/docs_prompting_with_media_94014c4b.txt b/text_content/docs_prompting_with_media_94014c4b.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0e44a103df192b296fb651369adfe6d6514139f
--- /dev/null
+++ b/text_content/docs_prompting_with_media_94014c4b.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/prompting_with_media
+Title: Files API  |  Gemini API  |  Google AI for Developers
+==================================================
+
+Files API
Files API

The Gemini family of artificial intelligence (AI) models is built to handle various types of input data, including text, images, and audio. Since these models can handle more than one type or mode of data, the Gemini models are called multimodal models, or described as having multimodal capabilities.

This guide shows you how to work with media files using the Files API. The basic operations are the same for audio files, images, videos, documents, and other supported file types. For file prompting guidance, check out the File prompt guide section.

Upload a file

You can use the Files API to upload a media file. Always use the Files API when the total request size (including the files, text prompt, system instructions, etc.) is larger than 20 MB. The following code uploads a file and then uses the file in a call to generateContent.

Python

from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp3")

response = client.models.generate_content(
    model="gemini-2.0-flash", contents=["Describe this audio clip", myfile]
)
print(response.text)

JavaScript

import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp3",
    config: { mimeType: "audio/mpeg" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Describe this audio clip",
    ]),
  });
  console.log(response.text);
}

await main();

Go

file, err := client.UploadFileFromPath(ctx, "path/to/sample.mp3", nil)
if err != nil {
    log.Fatal(err)
}
defer client.DeleteFile(ctx, file.Name)

model := client.GenerativeModel("gemini-2.0-flash")
resp, err := model.GenerateContent(ctx,
    genai.FileData{URI: file.URI},
    genai.Text("Describe this audio clip"))
if err != nil {
    log.Fatal(err)
}

printResponse(resp)

REST

AUDIO_PATH="path/to/sample.mp3"
MIME_TYPE=$(file -b --mime-type "${AUDIO_PATH}")
NUM_BYTES=$(wc -c < "${AUDIO_PATH}")
DISPLAY_NAME=AUDIO
# The Gemini API endpoint (not defined in the original snippet).
BASE_URL="https://generativelanguage.googleapis.com"
tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers; dump them to a file.
curl "${BASE_URL}/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D "${tmp_header_file}" \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2> /dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d" " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
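# (The metadata request above only started a resumable upload session and
# returned the destination in the X-Goog-Upload-Url response header. The
# request below streams the file bytes to that session URL and finalizes
# the upload in one step: offset 0, command "upload, finalize".)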
curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ AUDIO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri # Now generate content using that file curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"text": "Describe this audio clip"}, {"file_data":{"mime_type": "${MIME_TYPE}", "file_uri": ' $file_uri '}}] }] }' 2 > /dev/null > response.json cat response.json echo jq ".candidates[].content.parts[].text" response.json Get metadata for a file You can verify that the API successfully stored the uploaded file and get its metadata by calling files.get . Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) file_name = myfile . name myfile = client . files . get ( name = file_name ) print ( myfile ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; const fetchedFile = await ai . files . get ({ name : fileName }); console . log ( fetchedFile ); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } gotFile , err := client . GetFile ( ctx , file . Name ) if err != nil { log . Fatal ( err ) } fmt . Println ( "Got file:" , gotFile . Name ) REST # file_info.json was created in the upload example name = $( jq ".file.name" file_info.json ) # Get the file of interest to check state curl https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " > file_info.json # Print some information about the file you got name = $( jq ".file.name" file_info.json ) echo name = $name file_uri = $( jq ".file.uri" file_info.json ) echo file_uri = $file_uri List uploaded files You can upload multiple files using the Files API. The following code gets a list of all the files uploaded: Python print ( 'My files:' ) for f in client . files . list (): print ( ' ' , f . name ) JavaScript const listResponse = await ai . files . list ({ config : { pageSize : 10 } }); for await ( const file of listResponse ) { console . log ( file . name ); } Go iter := client . ListFiles ( ctx ) for { ifile , err := iter . Next () if err == iterator . Done { break } if err != nil { log . Fatal ( err ) } fmt . Println ( ifile . Name ) } REST echo "My files: " curl "https://generativelanguage.googleapis.com/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " Delete uploaded files Files are automatically deleted after 48 hours. You can also manually delete an uploaded file: Python myfile = client . files . upload ( file = 'path/to/sample.mp3' ) client . files . delete ( name = myfile . name ) JavaScript const myfile = await ai . files . upload ({ file : "path/to/sample.mp3" , config : { mimeType : "audio/mpeg" }, }); const fileName = myfile . name ; await ai . files . delete ({ name : fileName }); Go file , err := client . UploadFileFromPath ( ctx , "path/to/sample.mp3" , nil ) if err != nil { log . Fatal ( err ) } client . DeleteFile ( ctx , file . Name ) REST curl --request "DELETE" https://generativelanguage.googleapis.com/v1beta/files/ $name \ -H "x-goog-api-key: $GEMINI_API_KEY " Usage info You can use the Files API to upload and interact with media files. 
Usage info

You can use the Files API to upload and interact with media files. The Files API lets you store up to 20 GB of files per project, with a per-file maximum size of 2 GB. Files are stored for 48 hours. During that time, you can use the API to get metadata about the files, but you can't download the files. The Files API is available at no cost in all regions where the Gemini API is available.

File prompting strategies

This section provides guidance and best practices for using media files with prompts for the Gemini API. Being able to use various types of data in your prompts gives you more flexibility in terms of what tasks you can tackle with the Gemini API. For example, you can send the model a photo of a delicious meal and ask it to write a short blog post about the meal.

Prompt: Write a short, engaging blog post based on this picture. It should include a description of the meal in the photo and talk about my journey meal prepping.
Response: Meal prepping is a great way to save time and money, and it can also help you to eat healthier. This meal is a great example of a healthy and delicious meal that can be easily prepped ahead of time. This meal features brown rice, roasted vegetables, and chicken teriyaki. The brown rice is a healthy and complex carbohydrate that will give you sustained energy. The roasted vegetables are a great way to get your daily dose of vitamins and minerals, and the chicken teriyaki is a delicious and protein-rich option. This meal is also very easy to prepare. Simply cook the brown rice, roast the vegetables, and cook the chicken teriyaki. Once everything is cooked, divide it into meal prep containers and store them in the refrigerator. You can then grab a container and go on busy mornings! If you're looking for a healthy and delicious meal that can be easily prepped ahead of time, this meal is a great option. It's packed with nutrients and flavor, and it's sure to keep you feeling full and satisfied. Here's to healthy and delicious meal prepping!

If you are having trouble getting the output you want from prompts that use media files, there are some strategies that can help you get the results you want. The following sections provide design approaches and troubleshooting tips for improving prompts that use multimodal input. You can improve your multimodal prompts by following these best practices:

Prompt design fundamentals
- Be specific in your instructions: Craft clear and concise instructions that leave minimal room for misinterpretation.
- Add a few examples to your prompt: Use realistic few-shot examples to illustrate what you want to achieve.
- Break it down step-by-step: Divide complex tasks into manageable sub-goals, guiding the model through the process.
- Specify the output format: In your prompt, ask for the output to be in the format you want, like markdown, JSON, HTML, and more.
- Put your image first for single-image prompts: While Gemini can handle image and text inputs in any order, for prompts containing a single image, it might perform better if that image (or video) is placed before the text prompt. However, for prompts that require images to be highly interleaved with text to make sense, use whatever order is most natural.

Troubleshooting your multimodal prompt
- If the model is not drawing information from the relevant part of the image: Drop hints about which aspects of the image you want the prompt to draw information from.
- If the model output is too generic (not tailored enough to the image/video input): At the start of the prompt, try asking the model to describe the image(s) or video before providing the task instruction, or try asking the model to refer to what's in the image.
- To troubleshoot which part failed: Ask the model to describe the image, or ask the model to explain its reasoning, to gauge the model's initial understanding.
- If your prompt results in hallucinated content: Try dialing down the temperature setting or asking the model for shorter descriptions so that it's less likely to extrapolate additional details.
- Tuning the sampling parameters: Experiment with different temperature settings and top-k selections to adjust the model's creativity.

Be specific in your instructions

Prompts have the most success when they are clear and detailed. If you have a specific output in mind, it's better to include that requirement in the prompt to ensure you get the output you want. For this image of an airport board, asking the model to just "describe this image" could generate a general description. If you need the model to parse the time and city from the image, you can include that request directly in your prompt.

Prompt: Describe this image.
Model response: The image shows an airport arrivals and departures board.

Updated prompt: Parse the time and city from the airport board shown in this image into a list.
Improved response: 10:50 Moscow 11:05 Edinburgh 11:05 London 11:10 Bucharest 11:30 Kiev 11:35 Dublin 11:45 East Midlands 12:15 Sofia 12:30 London 12:30 Newcastle 12:40 St Petersburg 12:40 London 12:45 Manchester

Add a few examples

The Gemini model can accept multiple inputs which it can use as examples to understand the output you want. Adding these examples can help the model identify the patterns and apply the relationship between the given images and responses to the new example. This is also called "few-shot" learning. In the example below, the initial output is written in sentence form, and also contains the country (Brazil). Suppose you want the output in a different format or style, and you want only the city, not the country. Adding few-shot examples to your prompt can steer the model to respond in the way you want.

Prompt: Determine the city along with the landmark.
Model response: The landmark is the Christ the Redeemer statue in Rio de Janeiro, Brazil.

Updated prompt: Determine the city along with the landmark.
Improved response: city: Rome, landmark: the Colosseum. city: Beijing, landmark: Forbidden City city: Rio de Janeiro, landmark: Christ the Redeemer statue

Break it down step-by-step

For complex tasks like those that require both visual understanding and reasoning, it can be helpful to split the task into smaller, more straightforward steps. Alternatively, it can also be effective to directly ask the model to "think step by step" in your prompt.

Prompt: When will I run out of toilet paper?
Model response: Soon, you only have 3 rolls left.

Updated prompt: 1. First, count how many toilet paper rolls are in this picture. 2. Then, determine how much toilet paper a typical person uses per day. 3. Calculate how long these rolls of toilet paper will last.
Improved response: 1. There are 3 rolls of toilet paper in this picture. 2. A typical person uses about 20 sheets of toilet paper per day. 3. If each roll contains 200 sheets, then each roll will last for about 10 days. Therefore, the 3 rolls will last for about a month.
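The "few-shot" and "step-by-step" patterns above can be scripted directly. A minimal sketch, assuming the google-genai Python SDK used elsewhere in this guide; the image paths are placeholders, and interleaving uploaded files with strings in contents follows the upload samples shown earlier:

```python
from google import genai

client = genai.Client()  # assumes GEMINI_API_KEY is set in the environment

# Few-shot: pair two example images with the exact output style you want,
# then let the model apply the same pattern to a new image.
example1 = client.files.upload(file="images/colosseum.jpg")          # placeholder
example2 = client.files.upload(file="images/forbidden_city.jpg")     # placeholder
query = client.files.upload(file="images/christ_the_redeemer.jpg")   # placeholder

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        "Determine the city along with the landmark.",
        example1, "city: Rome, landmark: the Colosseum",
        example2, "city: Beijing, landmark: Forbidden City",
        query,  # the model should now answer in the same terse format
    ],
)
print(response.text)

# Step-by-step: spell the sub-goals out in the prompt itself.
photo = client.files.upload(file="images/toilet_paper.jpg")  # placeholder
steps_prompt = (
    "1. First, count how many toilet paper rolls are in this picture.\n"
    "2. Then, determine how much toilet paper a typical person uses per day.\n"
    "3. Calculate how long these rolls of toilet paper will last."
)
response = client.models.generate_content(
    model="gemini-2.0-flash", contents=[photo, steps_prompt]
)
print(response.text)
```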
Math problems or other types of word problems are great candidates for asking the model to think step-by-step.

Prompt: What is the 4th term in the sequence?
Response: -135

The response from the model is incorrect. One way to improve this is to ask the model to parse the problem before answering the prompt.

Prompt: Parse the formula in the image first. Then based on the formula, answer "what's the 4th term in the sequence?".
Response: $$ b(n) = b(n-1) * (-3) $$ $$ b(1) = 15 $$ Substitute n = 4 into the formula: $$ b(4) = b(4-1) * (-3) $$ $$ b(4) = b(3) * (-3) $$ We don't know the value of b(3), so substitute n = 3 into the formula: $$ b(3) = b(3-1) * (-3) $$ $$ b(3) = b(2) * (-3) $$ We don't know the value of b(2), so substitute n = 2 into the formula: $$ b(2) = b(2-1) * (-3) $$ $$ b(2) = b(1) * (-3) $$ We know the value of b(1) is 15, so substitute it into the formula: $$ b(2) = 15 * (-3) $$ $$ b(2) = -45 $$ Now that we know the value of b(2), substitute it into the formula for b(3): $$ b(3) = b(2) * (-3) $$ $$ b(3) = -45 * (-3) $$ $$ b(3) = 135 $$ Now that we know the value of b(3), substitute it into the formula for b(4): $$ b(4) = b(3) * (-3) $$ $$ b(4) = 135 * (-3) $$ $$ b(4) = -405 $$ Therefore, the 4th term in the sequence is -405.

This model response is correct. Now, to make the response even more streamlined, you can ask the model to think step-by-step.

Updated prompt: What is the 4th term in the sequence? Think step by step.
Improved response: The first term is 15. The second term is b(1) * (-3) = 15 * (-3) = -45. The third term is b(2) * (-3) = -45 * (-3) = 135. The fourth term is b(3) * (-3) = 135 * (-3) = -405.

Specify the output format

A common problem is the need for model output to be in a certain format (e.g., markdown, JSON, HTML), especially if the model output needs to be ingested by a downstream task. You can try to produce output in that format by instructing the model to do so within the prompt. Below are two examples:

Prompt: Parse the table in this image into markdown format
Model response:
| Subject | Status | Time | Start date | Due on |
| ------- | ------ | ---- | ---------- | ------ |
| Math | Done | 1.5 hrs | 11/29/2023 | 12/1/2023 |
| Geography | Done | 30 min | 12/2/2023 | 12/4/2023 |
| Physics | In progress | 2 hrs | 12/3/2023 | 12/5/2023 |
| Biology | Not started | 6 hrs | 11/2/2016 | 12/6/2023 |

Prompt: Provide a list of all the following attributes: ingredients, type of cuisine, vegetarian or not, in JSON format
Model response:
```json
{
  "ingredients": ["rice", "shrimp", "clams", "mussels", "peas", "tomatoes", "onions", "garlic", "olive oil", "paprika", "salt", "pepper"],
  "type of cuisine": "Spanish",
  "vegetarian": false
}
```

Put your image first for single-image prompts

While Gemini can interpret image and text in any order within a prompt, placing a single image before the text prompt might lead to better results. In the following samples, you'll notice the image comes first before the text in each prompt.

Troubleshooting your multimodal prompt

You might need to troubleshoot your prompt if you are not getting a helpful response. Here are a few strategies you could try.

If the model is not drawing information from the relevant part of the image

To get a more specific response, you can point out which aspects of the image you want the prompt to draw information from to inform its response.

Prompt: How many days will these diapers last a baby?
Model response: The diapers will last for 198 days before they run out.
Updated prompt: How long will these diapers last before I run out? Use the weight shown on the box to determine the child's age, and use the total number of diapers in the box. Divide the total number by how many diapers the child goes through per day.
Improved response: The box contains 198 diapers. The box states the diapers are for a child weighing 22-37 lbs. The average child in this weight range is likely between 1 and 2 years old. A 2-year-old typically goes through 6 diapers per day, therefore, the diapers will last around 33 days (198/6).

If the model output is too generic and not tailored enough to the image input

To help the model tailor its response to the image(s), try asking it to describe the images before performing its reasoning task.

Prompt: What is in common between these images?
Model response: All four images are examples of landscape photography.

Updated prompt: First, describe what's in each image in detail. What's in common between these images?
Improved response: The first image shows a person packing a suitcase. The second image shows an empty hallway in a shopping mall. The third image shows a roll of toilet paper. The fourth image shows a dark and stormy sea. The commonality between these images is that they are all examples of liminal spaces. Liminal spaces are places or situations that are in between two other places or situations. They are often characterized by a sense of uncertainty or ambiguity. The first image shows the person in between being at home and being on a trip. The second image shows the hallway in between two stores. The third image is of toilet paper, which can represent a bathroom. A bathroom is a transitional space between private and public realms. It's a place where we prepare ourselves to enter or leave the outside world. The fourth image shows the sea in between being calm and being stormy. Liminal spaces can be both exciting and unsettling. They can be a place of new beginning or a place of endings. They can be a place of reflection or a place of action.

You can also try explicitly asking the model to refer to what's in the image.

Updated prompt: What is in common between these images? Refer to what's in the images in your response.
Improved response: All of the images are related to travel. The first image shows a person packing a suitcase, the second image shows an empty hallway in a shopping mall, the third image shows a stack of toilet paper, and the fourth image shows a dark and stormy sea. These images are all related to travel because they are all things that people need or use when they travel. The first image shows a person packing a suitcase, which is something that people do when they are going on a trip. The second image shows an empty hallway in a shopping mall, which is a place where people often go to buy things for their trips. The third image shows a stack of toilet paper, which is something that people often use when they are traveling. The fourth image shows a dark and stormy sea, which is something that people often encounter when they are traveling.

The same tips work for creative output from images. In the following example with the rabbit, explicitly asking the model to include aspects from the image can improve the output.

Prompt: Can you write me a poem about this image?
Model response: Little bunny, so soft and brown, Hopping through the field, up and down. Your ears so long, your tail so small, You're the cutest creature of them all. You nibble on the grass, so green and sweet, And hop away when you see me.
But I'll always be your friend, Little bunny, until the very end.

Updated prompt: Can you write me a descriptive and dramatic poem about this image and include the location?
Improved response: Little bunny on the green, Ears so long, so keen. What do you see out there, In the distance, where you stare? Is it the ocean blue, Sparkling in the sun's hue? Or the waves that crash and roar, A rhythm that you adore? Little bunny, tell me true, What captures your attention so? Is it the salty breeze, Or the seagulls' cries that please? Perhaps it's the vastness of it all, A world so big and small. A place where dreams can roam, And adventures can be sown. Little bunny, I wonder what you dream, As you sit on the grass, so serene. Do you yearn to explore the deep, Or stay on land, where you can leap? Whatever it may be, little bunny, Keep that spark of wonder burning brightly. For in your dreams and aspirations, Lies a world waiting for your creation.

Troubleshooting which part of the prompt failed

It can be hard to know whether a prompt failed because the model didn't understand the image to begin with, or if it did understand the image but did not perform the correct reasoning steps afterward. To disambiguate between those reasons, ask the model to describe what's in the image. In the following example, if the model responds with a snack that seems surprising when paired with tea (e.g., popcorn), you can first troubleshoot to determine whether the model correctly recognized that the image contains tea.

Prompt: What's a snack I can make in 1 minute that would go well with this?
Prompt for troubleshooting: Describe what's in this image.

Another strategy is to ask the model to explain its reasoning. That can help you narrow down which part of the reasoning broke down, if any.

Prompt: What's a snack I can make in 1 minute that would go well with this?
Prompt for troubleshooting: What's a snack I can make in 1 minute that would go well with this? Please explain why.

What's next

Try writing your own multimodal prompts using Google AI Studio. For information on using the Gemini Files API for uploading media files and including them in your prompts, see the Vision, Audio, and Document processing guides. For more guidance on prompt design, like tuning sampling parameters, see the Prompt strategies page.
\ No newline at end of file
diff --git a/text_content/docs_quickstart_264912f5.txt b/text_content/docs_quickstart_264912f5.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f99e8fb8d1fe1ad6b02a98e8026ab79184a4df5f
--- /dev/null
+++ b/text_content/docs_quickstart_264912f5.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/quickstart
+Title: Gemini API quickstart  |  Google AI for Developers
+==================================================
+
+Gemini API quickstart | Google AI for Developers
Gemini API quickstart

This quickstart shows you how to install our libraries and make your first Gemini API request.

Before you begin

You need a Gemini API key. If you don't already have one, you can get it for free in Google AI Studio.

Install the Google GenAI SDK

Python

Using Python 3.9+, install the google-genai package using the following pip command:

pip install -q -U google-genai

JavaScript

Using Node.js v18+, install the Google Gen AI SDK for TypeScript and JavaScript using the following npm command:

npm install @google/genai

Go

Install google.golang.org/genai in your module directory using the go get command:

go get google.golang.org/genai

Java

If you're using Maven, you can install google-genai by adding the following to your dependencies:

<dependency>
  <groupId>com.google.genai</groupId>
  <artifactId>google-genai</artifactId>
  <version>1.0.0</version>
</dependency>

Apps Script

1. To create a new Apps Script project, go to script.new.
2. Click Untitled project.
3. Rename the Apps Script project AI Studio and click Rename.
4. Set your API key: at the left, click Project Settings. Under Script Properties click Add script property. For Property, enter the key name: GEMINI_API_KEY. For Value, enter the value for the API key. Click Save script properties.
5. Replace the Code.gs file contents with the following code:

Make your first request

Here is an example that uses the generateContent method to send a request to the Gemini API using the Gemini 2.5 Flash model. If you set your API key as the environment variable GEMINI_API_KEY, it will be picked up automatically by the client when using the Gemini API libraries. Otherwise you will need to pass your API key as an argument when initializing the client. Note that all code samples in the Gemini API docs assume that you have set the environment variable GEMINI_API_KEY.

Python

from google import genai

# The client gets the API key from the environment variable `GEMINI_API_KEY`.
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Explain how AI works in a few words"
)
print(response.text)

JavaScript

import { GoogleGenAI } from "@google/genai";

// The client gets the API key from the environment variable `GEMINI_API_KEY`.
const ai = new GoogleGenAI({});

async function main() {
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: "Explain how AI works in a few words",
  });
  console.log(response.text);
}

main();

Go

package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    // The client gets the API key from the environment variable `GEMINI_API_KEY`.
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    result, err := client.Models.GenerateContent(
        ctx,
        "gemini-2.5-flash",
        genai.Text("Explain how AI works in a few words"),
        nil,
    )
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(result.Text())
}

Java

package com.example;

import com.google.genai.Client;
import com.google.genai.types.GenerateContentResponse;

public class GenerateTextFromTextInput {
  public static void main(String[] args) {
    // The client gets the API key from the environment variable `GEMINI_API_KEY`.
    Client client = new Client();

    GenerateContentResponse response =
        client.models.generateContent(
            "gemini-2.5-flash", "Explain how AI works in a few words", null);

    System.out.println(response.text());
  }
}
Apps Script

// See https://developers.google.com/apps-script/guides/properties
// for instructions on how to set the API key.
const apiKey = PropertiesService.getScriptProperties().getProperty('GEMINI_API_KEY');

function main() {
  const payload = {
    contents: [
      {
        parts: [
          { text: 'Explain how AI works in a few words' },
        ],
      },
    ],
  };

  const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent';
  const options = {
    method: 'POST',
    contentType: 'application/json',
    headers: {
      'x-goog-api-key': apiKey,
    },
    payload: JSON.stringify(payload),
  };

  const response = UrlFetchApp.fetch(url, options);
  const data = JSON.parse(response);
  const content = data['candidates'][0]['content']['parts'][0]['text'];
  console.log(content);
}

REST

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [
      {
        "parts": [
          { "text": "Explain how AI works in a few words" }
        ]
      }
    ]
  }'

"Thinking" is on by default on many of our code samples

Many code samples on this site use the Gemini 2.5 Flash model, which has the "thinking" feature enabled by default to enhance response quality. You should be aware that this may increase response time and token usage. If you prioritize speed or wish to minimize costs, you can disable this feature by setting the thinking budget to zero, as shown in the examples below. For more details, see the thinking guide. Note: Thinking is only available on Gemini 2.5 series models and can't be disabled on Gemini 2.5 Pro.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0)  # Disables thinking
    ),
)
print(response.text)

JavaScript

import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: "Explain how AI works in a few words",
    config: {
      thinkingConfig: {
        thinkingBudget: 0, // Disables thinking
      },
    },
  });
  console.log(response.text);
}

await main();

Go

package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    result, _ := client.Models.GenerateContent(
        ctx,
        "gemini-2.5-flash",
        genai.Text("Explain how AI works in a few words"),
        &genai.GenerateContentConfig{
            ThinkingConfig: &genai.ThinkingConfig{
                ThinkingBudget: int32(0), // Disables thinking
            },
        },
    )
    fmt.Println(result.Text())
}

REST

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [
      {
        "parts": [
          { "text": "Explain how AI works in a few words" }
        ]
      }
    ],
    "generationConfig": {
      "thinkingConfig": {
        "thinkingBudget": 0
      }
    }
  }'

Apps Script

// See https://developers.google.com/apps-script/guides/properties
// for instructions on how to set the API key.
const apiKey = PropertiesService.getScriptProperties().getProperty('GEMINI_API_KEY');

function main() {
  const payload = {
    contents: [
      {
        parts: [
          { text: 'Explain how AI works in a few words' },
        ],
      },
    ],
  };

  const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent';
  const options = {
    method: 'POST',
    contentType: 'application/json',
    headers: {
      'x-goog-api-key': apiKey,
    },
    payload: JSON.stringify(payload),
  };

  const response = UrlFetchApp.fetch(url, options);
  const data = JSON.parse(response);
  const content = data['candidates'][0]['content']['parts'][0]['text'];
  console.log(content);
}
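As noted under "Make your first request," the client reads GEMINI_API_KEY from the environment by default. If setting an environment variable isn't practical in your deployment, the key can be passed when constructing the client instead. A minimal Python sketch; the placeholder string is exactly that, and in practice the real key should come from a secret manager rather than source code:

```python
from google import genai

# Prefer the GEMINI_API_KEY environment variable; pass api_key explicitly
# only when an environment variable isn't an option (placeholder shown).
client = genai.Client(api_key="YOUR_API_KEY")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)
```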
What's next

Now that you made your first API request, you might want to explore the following guides that show Gemini in action: Thinking, Text generation, Vision, Long context.
\ No newline at end of file
diff --git a/text_content/docs_safety-guidance_44c74d3a.txt b/text_content/docs_safety-guidance_44c74d3a.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0c96fa80101dff35b89965693216d5b229312672
--- /dev/null
+++ b/text_content/docs_safety-guidance_44c74d3a.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/safety-guidance
+Title: Safety guidance  |  Gemini API  |  Google AI for Developers
+==================================================
+
+Safety guidance | Gemini API | Google AI for Developers

Safety guidance

Generative artificial intelligence models are powerful tools, but they are not without their limitations. Their versatility and applicability can sometimes lead to unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and rigorous manual evaluation are essential to limit the risk of harm from such outputs. The models provided by the Gemini API can be used for a wide variety of generative AI and natural language processing (NLP) applications. These capabilities are available only through the Gemini API or the Google AI Studio web app. Your use of the Gemini API is also subject to the Generative AI Prohibited Use Policy and the Gemini API terms of service.

Part of what makes large language models (LLMs) so useful is that they're creative tools that can address many different language tasks. Unfortunately, this also means that large language models can generate output that you don't expect, including text that's offensive, insensitive, or factually incorrect. What's more, the incredible versatility of these models is also what makes it difficult to predict exactly what kinds of undesirable output they might produce. While the Gemini API has been designed with Google's AI principles in mind, the onus is on developers to apply these models responsibly.
To aid developers in creating safe, responsible applications, the Gemini API has some built-in content filtering as well as adjustable safety settings across 4 dimensions of harm. Refer to the safety settings guide to learn more. This document is meant to introduce you to some safety risks that can arise when using LLMs, and to recommend emerging safety design and development practices. (Note that laws and regulations may also impose restrictions, but such considerations are beyond the scope of this guide.) The following steps are recommended when building applications with LLMs: Understanding the safety risks of your application Considering adjustments to mitigate safety risks Performing safety testing appropriate to your use case Soliciting feedback from users and monitoring usage The adjustment and testing phases should be iterative until you reach performance appropriate for your application. Understand the safety risks of your application In this context, safety is defined as the ability of an LLM to avoid causing harm to its users, for example, by generating toxic language or content that promotes stereotypes. The models available through the Gemini API have been designed with Google's AI principles in mind and your use of them is subject to the Generative AI Prohibited Use Policy . The API provides built-in safety filters to help address some common language model problems such as toxic language and hate speech, and strives for inclusiveness and the avoidance of stereotypes. However, each application can pose a different set of risks to its users. So as the application owner, you are responsible for knowing your users and the potential harms your application may cause, and for ensuring that your application uses LLMs safely and responsibly. As part of this assessment, you should consider the likelihood that harm could occur and determine its seriousness and mitigation steps. For example, an app that generates essays based on factual events would need to be more careful about avoiding misinformation than an app that generates fictional stories for entertainment. A good way to begin exploring potential safety risks is to research your end users, and others who might be affected by your application's results. This can take many forms, including researching state-of-the-art studies in your app domain, observing how people are using similar apps, or running a user study, a survey, or informal interviews with potential users. Advanced tips Speak with a diverse mix of prospective users within your target population about your application and its intended purpose to get a wider perspective on potential risks and to adjust diversity criteria as needed. The AI Risk Management Framework released by the U.S. government's National Institute of Standards and Technology (NIST) provides more detailed guidance and additional learning resources for AI risk management. DeepMind's publication on the ethical and social risks of harm from language models describes in detail the ways that language model applications can cause harm. Consider adjustments to mitigate safety risks Now that you have an understanding of the risks, you can decide how to mitigate them. Determining which risks to prioritize and how much you should do to try to prevent them is a critical decision, similar to triaging bugs in a software project. Once you've determined priorities, you can start thinking about the types of mitigations that would be most appropriate.
Often simple changes can make a difference and reduce risks. For example, when designing an application consider: Tuning the model output to better reflect what is acceptable in your application context. Tuning can make the output of the model more predictable and consistent and therefore can help mitigate certain risks. Providing an input method that facilitates safer outputs. The exact input you give to an LLM can make a difference in the quality of the output. Experimenting with input prompts to find what works most safely in your use case is well worth the effort, as you can then provide a UX that facilitates it. For example, you could restrict users to choose only from a drop-down list of input prompts, or offer pop-up suggestions with descriptive phrases which you've found perform safely in your application context. Blocking unsafe inputs and filtering output before it is shown to the user. In simple situations, blocklists can be used to identify and block unsafe words or phrases in prompts or responses, or require human reviewers to manually alter or block such content. Note: Automatically blocking based on a static list can have unintended results such as targeting a particular group that commonly uses vocabulary in the blocklist. Using trained classifiers to label each prompt with potential harms or adversarial signals. Different strategies can then be employed on how to handle the request based on the type of harm detected. For example, if the input is overtly adversarial or abusive in nature, it could be blocked and instead output a pre-scripted response. Advanced tip If signals determine the output to be harmful, the application can employ the following options: Provide an error message or pre-scripted output. Try the prompt again, in case an alternative safe output is generated, since sometimes the same prompt will elicit different outputs (see the sketch below). Putting safeguards in place against deliberate misuse such as assigning each user a unique ID and imposing a limit on the volume of user queries that can be submitted in a given period. Another safeguard is to try to protect against possible prompt injection. Prompt injection, much like SQL injection, is a way for malicious users to design an input prompt that manipulates the output of the model, for example, by sending an input prompt that instructs the model to ignore any previous examples. See the Generative AI Prohibited Use Policy for details about deliberate misuse. Adjusting functionality to something that is inherently lower risk. Tasks that are narrower in scope (e.g., extracting keywords from passages of text) or that have greater human oversight (e.g., generating short-form content that will be reviewed by a human), often pose a lower risk. So for instance, instead of creating an application to write an email reply from scratch, you might instead limit it to expanding on an outline or suggesting alternative phrasings.
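A minimal sketch of how a few of these mitigations can be combined -- a static blocklist on input, a single retry, and a pre-scripted fallback -- assuming the google-genai Python SDK and that a blocked candidate reports types.FinishReason.SAFETY; the blocklist terms and fallback text are illustrative placeholders:

from google import genai
from google.genai import types

client = genai.Client()
BLOCKLIST = {"example banned phrase"}  # hypothetical terms for your domain
FALLBACK = "Sorry, I can't help with that request."  # pre-scripted output

def safe_generate(prompt: str) -> str:
    # Block unsafe input before it ever reaches the model.
    if any(term in prompt.lower() for term in BLOCKLIST):
        return FALLBACK
    # Try twice, since the same prompt can elicit different outputs.
    for _ in range(2):
        response = client.models.generate_content(
            model="gemini-2.5-flash", contents=prompt)
        if not response.candidates:  # the prompt itself may have been blocked
            continue
        if response.candidates[0].finish_reason != types.FinishReason.SAFETY:
            return response.text
    # The output stayed blocked, so fall back to the pre-scripted response.
    return FALLBACK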
Perform safety testing appropriate to your use case Testing is a key part of building robust and safe applications, but the extent, scope and strategies for testing will vary. For example, a just-for-fun haiku generator is likely to pose less severe risks than, say, an application designed for use by law firms to summarize legal documents and help draft contracts. But the haiku generator may be used by a wider variety of users, which means the potential for adversarial attempts or even unintended harmful inputs can be greater. The implementation context also matters. For instance, an application with outputs that are reviewed by human experts prior to any action being taken might be deemed less likely to produce harmful outputs than the identical application without such oversight. It's not uncommon to go through several iterations of making changes and testing before feeling confident that you're ready to launch, even for applications that are relatively low risk. Two kinds of testing are particularly useful for AI applications: Safety benchmarking involves designing safety metrics that reflect the ways your application could be unsafe in the context of how it is likely to get used, then testing how well your application performs on the metrics using evaluation datasets. It's good practice to think about the minimum acceptable levels of safety metrics before testing so that 1) you can evaluate the test results against those expectations and 2) you can gather the evaluation dataset based on the tests that evaluate the metrics you care about most. Advanced tips Beware of over-relying on "off the shelf" approaches as it's likely you'll need to build your own testing datasets using human raters to fully suit your application's context. If you have more than one metric you'll need to decide how you'll trade off if a change leads to improvements for one metric to the detriment of another. Like with other performance engineering, you may want to focus on worst-case performance across your evaluation set rather than average performance. Adversarial testing involves proactively trying to break your application. The goal is to identify points of weakness so that you can take steps to remedy them as appropriate. Adversarial testing can take significant time and effort from evaluators with expertise in your application -- but the more you do, the greater your chance of spotting problems, especially those occurring rarely or only after repeated runs of the application. Adversarial testing is a method for systematically evaluating an ML model with the intent of learning how it behaves when provided with malicious or inadvertently harmful input: An input may be malicious when the input is clearly designed to produce an unsafe or harmful output -- for example, asking a text generation model to generate a hateful rant about a particular religion. An input is inadvertently harmful when the input itself may be innocuous, but produces harmful output -- for example, asking a text generation model to describe a person of a particular ethnicity and receiving a racist output. What distinguishes an adversarial test from a standard evaluation is the composition of the data used for testing. For adversarial tests, select test data that is most likely to elicit problematic output from the model. This means probing the model's behavior for all the types of harms that are possible, including rare or unusual examples and edge cases that are relevant to safety policies. It should also include diversity in the different dimensions of a sentence such as structure, meaning, and length. You can refer to Google's Responsible AI practices in fairness for more details on what to consider when building a test dataset. Advanced tips Use automated testing instead of the traditional method of enlisting people in 'red teams' to try and break your application. In automated testing, the 'red team' is another language model that finds input text that elicits harmful outputs from the model being tested, as in the sketch below.
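A minimal sketch of that automated approach, assuming the google-genai Python SDK; the seed topics and the use of the same model family as both red team and target are illustrative choices, and the probe-generation step may itself be refused or blocked:

from google import genai

client = genai.Client()

# Hypothetical seed list; in practice it is derived from your safety policies.
SEEDS = ["a religion", "an ethnicity", "a nationality"]

findings = []
for seed in SEEDS:
    # The "red team" model drafts a probe intended to elicit unsafe output.
    probe = client.models.generate_content(
        model="gemini-2.5-flash",
        contents="Write one prompt a malicious user might send to make a "
                 f"chatbot produce harmful content about {seed}.",
    ).text
    if not probe:
        continue  # the red-team request itself may have been refused
    # Run the probe against the model under test and record the pair.
    answer = client.models.generate_content(
        model="gemini-2.5-flash", contents=probe)
    findings.append((probe, answer.text))  # text may be None if blocked

# Human raters or a safety classifier then review the findings for harm.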
Note: LLMs are known to sometimes produce different outputs for the same input prompt. Multiple rounds of testing may be needed to catch more of the problematic outputs. Monitor for problems No matter how much you test and mitigate, you can never guarantee perfection, so plan upfront how you'll spot and deal with problems that arise. Common approaches include setting up a monitored channel for users to share feedback (e.g., thumbs up/down rating) and running a user study to proactively solicit feedback from a diverse mix of users -- especially valuable if usage patterns are different from expectations. Advanced tips When users give feedback to AI products, it can greatly improve the AI performance and the user experience over time by, for example, helping you choose better examples for prompt tuning. The Feedback and Control chapter in Google's People and AI guidebook highlights key considerations to take into account when designing feedback mechanisms. Next steps Refer to the safety settings guide to learn about the adjustable safety settings available through the Gemini API. See the intro to prompting to get started writing your first prompts. \ No newline at end of file diff --git a/text_content/docs_safety-settings_0772edb3.txt b/text_content/docs_safety-settings_0772edb3.txt new file mode 100644 index 0000000000000000000000000000000000000000..47925c2d061f2ed1bfc5976c5f4df20f45813789 --- /dev/null +++ b/text_content/docs_safety-settings_0772edb3.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-settings#safety-filtering-per-request +Title: Safety settings  |  Gemini API  |  Google AI for Developers +================================================== + +Safety settings The Gemini API provides safety settings that you can adjust during the prototyping stage to determine if your application requires more or less restrictive safety configuration. You can adjust these settings across five filter categories to restrict or allow certain types of content. This guide covers how the Gemini API handles safety settings and filtering and how you can change the safety settings for your application. Note: Applications that use less restrictive safety settings may be subject to review. See the Terms of Service for more information. Safety filters The Gemini API's adjustable safety filters cover the following categories: Harassment: Negative or harmful comments targeting identity and/or protected attributes. Hate speech: Content that is rude, disrespectful, or profane. Sexually explicit: Contains references to sexual acts or other lewd content. Dangerous: Promotes, facilitates, or encourages harmful acts. Civic integrity: Election-related queries. These categories are defined in HarmCategory .
The Gemini models only support HARM_CATEGORY_HARASSMENT , HARM_CATEGORY_HATE_SPEECH , HARM_CATEGORY_SEXUALLY_EXPLICIT , HARM_CATEGORY_DANGEROUS_CONTENT , and HARM_CATEGORY_CIVIC_INTEGRITY . All other categories are used only by PaLM 2 (Legacy) models. You can use these filters to adjust what's appropriate for your use case. For example, if you're building video game dialogue, you may deem it acceptable to allow more content that's rated as Dangerous due to the nature of the game. In addition to the adjustable safety filters, the Gemini API has built-in protections against core harms, such as content that endangers child safety. These types of harm are always blocked and cannot be adjusted. Content safety filtering level The Gemini API categorizes the probability level of content being unsafe as HIGH , MEDIUM , LOW , or NEGLIGIBLE . The Gemini API blocks content based on the probability of content being unsafe and not the severity. This is important to consider because some content can have low probability of being unsafe even though the severity of harm could still be high. For example, comparing the sentences: The robot punched me. The robot slashed me up. The first sentence might result in a higher probability of being unsafe, but you might consider the second sentence to be a higher severity in terms of violence. Given this, it is important that you carefully test and consider what the appropriate level of blocking is needed to support your key use cases while minimizing harm to end users. Safety filtering per request You can adjust the safety settings for each request you make to the API. When you make a request, the content is analyzed and assigned a safety rating. The safety rating includes the category and the probability of the harm classification. For example, if the content was blocked due to the harassment category having a high probability, the safety rating returned would have category equal to HARASSMENT and harm probability set to HIGH . By default, safety settings block content (including prompts) with medium or higher probability of being unsafe across any filter. This baseline safety is designed to work for most use cases, so you should only adjust your safety settings if it's consistently required for your application. The following table describes the block settings you can adjust for each category. For example, if you set the block setting to Block few for the Hate speech category, everything that has a high probability of being hate speech content is blocked. But anything with a lower probability is allowed. Threshold (Google AI Studio) Threshold (API) Description Block none BLOCK_NONE Always show regardless of probability of unsafe content Block few BLOCK_ONLY_HIGH Block when high probability of unsafe content Block some BLOCK_MEDIUM_AND_ABOVE Block when medium or high probability of unsafe content Block most BLOCK_LOW_AND_ABOVE Block when low, medium or high probability of unsafe content N/A HARM_BLOCK_THRESHOLD_UNSPECIFIED Threshold is unspecified, block using default threshold If the threshold is not set, the default block threshold is Block none (for gemini-1.5-pro-002 and gemini-1.5-flash-002 and all newer stable GA models) or Block some (in all other models) for all categories except the Civic integrity category. 
The default block threshold for the Civic integrity category is Block none (for gemini-2.0-flash-001 aliased as gemini-2.0-flash , gemini-2.0-pro-exp-02-05 , and gemini-2.0-flash-lite ) both for Google AI Studio and the Gemini API, and Block most for all other models in Google AI Studio only. You can set these settings for each request you make to the generative service. See the HarmBlockThreshold API reference for details. Safety feedback generateContent returns a GenerateContentResponse which includes safety feedback. Prompt feedback is included in promptFeedback . If promptFeedback.blockReason is set, then the content of the prompt was blocked. Response candidate feedback is included in Candidate.finishReason and Candidate.safetyRatings . If response content was blocked and the finishReason was SAFETY , you can inspect safetyRatings for more details. The content that was blocked is not returned. Adjust safety settings This section covers how to adjust the safety settings in both Google AI Studio and in your code. Google AI Studio You can adjust safety settings in Google AI Studio, but you cannot turn them off. Click Edit safety settings in the Run settings panel to open the Run safety settings modal. In the modal, you can use the sliders to adjust the content filtering level per safety category: Note: If you set any of the category filters to Block none , Google AI Studio will display a reminder about the Gemini API's Terms of Service with respect to safety settings. When you send a request (for example, by asking the model a question), a warning No Content message appears if the request's content is blocked. To see more details, hold the pointer over the No Content text and click warning Safety . Gemini API SDKs The following code snippet shows how to set safety settings in your GenerateContent call. This sets the thresholds for the harassment ( HARM_CATEGORY_HARASSMENT ) and hate speech ( HARM_CATEGORY_HATE_SPEECH ) categories. For example, setting these categories to BLOCK_LOW_AND_ABOVE blocks any content that has a low or higher probability of being harassment or hate speech. To understand the threshold settings, see Safety filtering per request . Python from google import genai from google.genai import types import PIL.Image img = PIL . Image . open ( "cookies.jpg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ 'Do these look store-bought or homemade?' , img ], config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = types . HarmCategory . HARM_CATEGORY_HATE_SPEECH , threshold = types . HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , ), ] ) ) print ( response . text ) Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SafetySettings : [] * genai . SafetySetting { { Category : "HARM_CATEGORY_HATE_SPEECH" , Threshold : "BLOCK_LOW_AND_ABOVE" , }, }, } response , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Some potentially unsafe prompt." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( response . 
Text ()) } JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const safetySettings = [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_LOW_AND_ABOVE" , }, { category : "HARM_CATEGORY_HATE_SPEECH" , threshold : "BLOCK_LOW_AND_ABOVE" , }, ]; async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Some potentially unsafe prompt." , config : { safetySettings : safetySettings , }, }); console . log ( response . text ); } await main (); Dart (Flutter) final safetySettings = [ SafetySetting ( HarmCategory . harassment , HarmBlockThreshold . low ), SafetySetting ( HarmCategory . hateSpeech , HarmBlockThreshold . low ), ]; final model = GenerativeModel ( model: 'gemini-1.5-flash' , apiKey: apiKey , safetySettings: safetySettings , ); Kotlin val harassmentSafety = SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ) val hateSpeechSafety = SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ) val generativeModel = GenerativeModel ( modelName = "gemini-1.5-flash" , apiKey = BuildConfig . apiKey , safetySettings = listOf ( harassmentSafety , hateSpeechSafety ) ) Java SafetySetting harassmentSafety = new SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ); SafetySetting hateSpeechSafety = new SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ); GenerativeModel gm = new GenerativeModel ( "gemini-1.5-flash" , BuildConfig . apiKey , null , // generation config is optional Arrays . asList ( harassmentSafety , hateSpeechSafety ) ); GenerativeModelFutures model = GenerativeModelFutures . from ( gm ); REST echo '{ "safetySettings": [ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"} ], "contents": [{ "parts":[{ "text": "' I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write an ironic phrase about them. '"}]}]}' > request.json curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d @request.json 2 > /dev/null Next steps See the API reference to learn more about the full API. Review the safety guidance for a general look at safety considerations when developing with LLMs. Learn more about assessing probability versus severity from the Jigsaw team Learn more about the products that contribute to safety solutions like the Perspective API . * You can use these safety settings to create a toxicity classifier. See the classification example to get started.
\ No newline at end of file diff --git a/text_content/docs_safety-settings_feb7ceb2.txt b/text_content/docs_safety-settings_feb7ceb2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c75f1f1e4a340238a034d3e78291020f04f4d12f --- /dev/null +++ b/text_content/docs_safety-settings_feb7ceb2.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/safety-settings#main-content +Title: Safety settings  |  Gemini API  |  Google AI for Developers +================================================== + +Safety settings The Gemini API provides safety settings that you can adjust during the prototyping stage to determine if your application requires more or less restrictive safety configuration. You can adjust these settings across five filter categories to restrict or allow certain types of content. This guide covers how the Gemini API handles safety settings and filtering and how you can change the safety settings for your application. Note: Applications that use less restrictive safety settings may be subject to review. See the Terms of Service for more information. Safety filters The Gemini API's adjustable safety filters cover the following categories: Harassment: Negative or harmful comments targeting identity and/or protected attributes. Hate speech: Content that is rude, disrespectful, or profane. Sexually explicit: Contains references to sexual acts or other lewd content. Dangerous: Promotes, facilitates, or encourages harmful acts. Civic integrity: Election-related queries. These categories are defined in HarmCategory . The Gemini models only support HARM_CATEGORY_HARASSMENT , HARM_CATEGORY_HATE_SPEECH , HARM_CATEGORY_SEXUALLY_EXPLICIT , HARM_CATEGORY_DANGEROUS_CONTENT , and HARM_CATEGORY_CIVIC_INTEGRITY . All other categories are used only by PaLM 2 (Legacy) models. You can use these filters to adjust what's appropriate for your use case. For example, if you're building video game dialogue, you may deem it acceptable to allow more content that's rated as Dangerous due to the nature of the game. In addition to the adjustable safety filters, the Gemini API has built-in protections against core harms, such as content that endangers child safety. These types of harm are always blocked and cannot be adjusted. Content safety filtering level The Gemini API categorizes the probability level of content being unsafe as HIGH , MEDIUM , LOW , or NEGLIGIBLE . The Gemini API blocks content based on the probability of content being unsafe and not the severity. This is important to consider because some content can have low probability of being unsafe even though the severity of harm could still be high. For example, comparing the sentences: The robot punched me. The robot slashed me up. The first sentence might result in a higher probability of being unsafe, but you might consider the second sentence to be a higher severity in terms of violence. Given this, it is important that you carefully test and consider what the appropriate level of blocking is needed to support your key use cases while minimizing harm to end users.
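To see those probability levels directly, here's a minimal sketch, assuming the google-genai Python SDK's snake_case field names (each candidate exposing safety_ratings with category and probability); with the Dangerous content threshold set to BLOCK_NONE, neither example sentence is blocked, so their ratings can be compared:

from google import genai
from google.genai import types

client = genai.Client()
config = types.GenerateContentConfig(safety_settings=[
    types.SafetySetting(
        category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=types.HarmBlockThreshold.BLOCK_NONE,
    ),
])
for sentence in ["The robot punched me.", "The robot slashed me up."]:
    response = client.models.generate_content(
        model="gemini-2.0-flash", contents=sentence, config=config)
    # Ratings report probability (NEGLIGIBLE/LOW/MEDIUM/HIGH), not severity.
    for rating in response.candidates[0].safety_ratings or []:
        print(sentence, rating.category, rating.probability)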
Safety filtering per request You can adjust the safety settings for each request you make to the API. When you make a request, the content is analyzed and assigned a safety rating. The safety rating includes the category and the probability of the harm classification. For example, if the content was blocked due to the harassment category having a high probability, the safety rating returned would have category equal to HARASSMENT and harm probability set to HIGH . By default, safety settings block content (including prompts) with medium or higher probability of being unsafe across any filter. This baseline safety is designed to work for most use cases, so you should only adjust your safety settings if it's consistently required for your application. The following table describes the block settings you can adjust for each category. For example, if you set the block setting to Block few for the Hate speech category, everything that has a high probability of being hate speech content is blocked. But anything with a lower probability is allowed. Threshold (Google AI Studio) Threshold (API) Description Block none BLOCK_NONE Always show regardless of probability of unsafe content Block few BLOCK_ONLY_HIGH Block when high probability of unsafe content Block some BLOCK_MEDIUM_AND_ABOVE Block when medium or high probability of unsafe content Block most BLOCK_LOW_AND_ABOVE Block when low, medium or high probability of unsafe content N/A HARM_BLOCK_THRESHOLD_UNSPECIFIED Threshold is unspecified, block using default threshold If the threshold is not set, the default block threshold is Block none (for gemini-1.5-pro-002 and gemini-1.5-flash-002 and all newer stable GA models) or Block some (in all other models) for all categories except the Civic integrity category. The default block threshold for the Civic integrity category is Block none (for gemini-2.0-flash-001 aliased as gemini-2.0-flash , gemini-2.0-pro-exp-02-05 , and gemini-2.0-flash-lite ) both for Google AI Studio and the Gemini API, and Block most for all other models in Google AI Studio only. You can set these settings for each request you make to the generative service. See the HarmBlockThreshold API reference for details. Safety feedback generateContent returns a GenerateContentResponse which includes safety feedback. Prompt feedback is included in promptFeedback . If promptFeedback.blockReason is set, then the content of the prompt was blocked. Response candidate feedback is included in Candidate.finishReason and Candidate.safetyRatings . If response content was blocked and the finishReason was SAFETY , you can inspect safetyRatings for more details. The content that was blocked is not returned. Adjust safety settings This section covers how to adjust the safety settings in both Google AI Studio and in your code. Google AI Studio You can adjust safety settings in Google AI Studio, but you cannot turn them off. Click Edit safety settings in the Run settings panel to open the Run safety settings modal. In the modal, you can use the sliders to adjust the content filtering level per safety category: Note: If you set any of the category filters to Block none , Google AI Studio will display a reminder about the Gemini API's Terms of Service with respect to safety settings. When you send a request (for example, by asking the model a question), a warning No Content message appears if the request's content is blocked. To see more details, hold the pointer over the No Content text and click warning Safety . 
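Pulling those feedback fields together, a minimal sketch, assuming the google-genai Python SDK's snake_case names for the documented fields (prompt_feedback.block_reason, finish_reason, safety_ratings) and a placeholder prompt:

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.0-flash", contents="Some potentially unsafe prompt.")

if response.prompt_feedback and response.prompt_feedback.block_reason:
    # The prompt itself was blocked; no content is returned.
    print("Prompt blocked:", response.prompt_feedback.block_reason)
else:
    candidate = response.candidates[0]
    if candidate.finish_reason == types.FinishReason.SAFETY:
        # The response was blocked; the ratings say which category fired.
        print("Response blocked:", candidate.safety_ratings)
    else:
        print(response.text)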
Gemini API SDKs The following code snippet shows how to set safety settings in your GenerateContent call. This sets the thresholds for the harassment ( HARM_CATEGORY_HARASSMENT ) and hate speech ( HARM_CATEGORY_HATE_SPEECH ) categories. For example, setting these categories to BLOCK_LOW_AND_ABOVE blocks any content that has a low or higher probability of being harassment or hate speech. To understand the threshold settings, see Safety filtering per request . Python from google import genai from google.genai import types import PIL.Image img = PIL . Image . open ( "cookies.jpg" ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.0-flash" , contents = [ 'Do these look store-bought or homemade?' , img ], config = types . GenerateContentConfig ( safety_settings = [ types . SafetySetting ( category = types . HarmCategory . HARM_CATEGORY_HATE_SPEECH , threshold = types . HarmBlockThreshold . BLOCK_LOW_AND_ABOVE , ), ] ) ) print ( response . text ) Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SafetySettings : [] * genai . SafetySetting { { Category : "HARM_CATEGORY_HATE_SPEECH" , Threshold : "BLOCK_LOW_AND_ABOVE" , }, }, } response , err := client . Models . GenerateContent ( ctx , "gemini-2.0-flash" , genai . Text ( "Some potentially unsafe prompt." ), config , ) if err != nil { log . Fatal ( err ) } fmt . Println ( response . Text ()) } JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const safetySettings = [ { category : "HARM_CATEGORY_HARASSMENT" , threshold : "BLOCK_LOW_AND_ABOVE" , }, { category : "HARM_CATEGORY_HATE_SPEECH" , threshold : "BLOCK_LOW_AND_ABOVE" , }, ]; async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Some potentially unsafe prompt." , config : { safetySettings : safetySettings , }, }); console . log ( response . text ); } await main (); Dart (Flutter) final safetySettings = [ SafetySetting ( HarmCategory . harassment , HarmBlockThreshold . low ), SafetySetting ( HarmCategory . hateSpeech , HarmBlockThreshold . low ), ]; final model = GenerativeModel ( model: 'gemini-1.5-flash' , apiKey: apiKey , safetySettings: safetySettings , ); Kotlin val harassmentSafety = SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ) val hateSpeechSafety = SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ) val generativeModel = GenerativeModel ( modelName = "gemini-1.5-flash" , apiKey = BuildConfig . apiKey , safetySettings = listOf ( harassmentSafety , hateSpeechSafety ) ) Java SafetySetting harassmentSafety = new SafetySetting ( HarmCategory . HARASSMENT , BlockThreshold . LOW_AND_ABOVE ); SafetySetting hateSpeechSafety = new SafetySetting ( HarmCategory . HATE_SPEECH , BlockThreshold . LOW_AND_ABOVE ); GenerativeModel gm = new GenerativeModel ( "gemini-1.5-flash" , BuildConfig . apiKey , null , // generation config is optional Arrays . asList ( harassmentSafety , hateSpeechSafety ) ); GenerativeModelFutures model = GenerativeModelFutures . 
from ( gm ); REST echo '{ "safetySettings": [ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"}, {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"} ], "contents": [{ "parts":[{ "text": "' I support Martians Soccer Club and I think Jupiterians Football Club sucks! Write an ironic phrase about them. '"}]}]}' > request.json curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d @request.json 2 > /dev/null Next steps See the API reference to learn more about the full API. Review the safety guidance for a general look at safety considerations when developing with LLMs. Learn more about assessing probability versus severity from the Jigsaw team Learn more about the products that contribute to safety solutions like the Perspective API . * You can use these safety settings to create a toxicity classifier. See the classification example to get started. \ No newline at end of file diff --git a/text_content/docs_speech-generation_33f2a7c4.txt b/text_content/docs_speech-generation_33f2a7c4.txt new file mode 100644 index 0000000000000000000000000000000000000000..b14d3fffb51dbc72e1a9981090e8d50b501dbf90 --- /dev/null +++ b/text_content/docs_speech-generation_33f2a7c4.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#languages +Title: Speech generation (text-to-speech)  |  Gemini API  |  Google AI for Developers +================================================== + +Speech generation (text-to-speech) The Gemini API can transform text input into single-speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case.
You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!' }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . 
You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" 
}] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in a spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model = "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts' names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) # ...Code to stream or save the output (see the sketch below) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const transcript = ( await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts' names are Dr. Anya and Liam." , }) ) . text ; const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : transcript , config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : "Dr. Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ...JavaScript code for exporting .wav file for output audio await main ();
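The Python example above ends with an elided saving step; a minimal sketch of that step, reusing the wave_file helper defined in the multi-speaker example earlier on this page (the output filename is a hypothetical placeholder):

# Extract the PCM bytes and save them with the wave_file helper from above.
data = response.candidates[0].content.parts[0].inline_data.data
wave_file('podcast.wav', data)  # hypothetical output filename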
Anya" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Kore" }, } }, { speaker : "Liam" , voiceConfig : { prebuiltVoiceConfig : { voiceName : "Puck" }, } } ] } } } }); } // ..JavaScript code for exporting .wav file for output audio await main (); Voice options TTS models support the following 30 voice options in the voice_name field: Zephyr -- Bright Puck -- Upbeat Charon -- Informative Kore -- Firm Fenrir -- Excitable Leda -- Youthful Orus -- Firm Aoede -- Breezy Callirrhoe -- Easy-going Autonoe -- Bright Enceladus -- Breathy Iapetus -- Clear Umbriel -- Easy-going Algieba -- Smooth Despina -- Smooth Erinome -- Clear Algenib -- Gravelly Rasalgethi -- Informative Laomedeia -- Upbeat Achernar -- Soft Alnilam -- Firm Schedar -- Even Gacrux -- Mature Pulcherrima -- Forward Achird -- Friendly Zubenelgenubi -- Casual Vindemiatrix -- Gentle Sadachbia -- Lively Sadaltager -- Knowledgeable Sulafat -- Warm You can hear all the voice options in AI Studio . Supported languages The TTS models detect the input language automatically. They support the following 24 languages: Language BCP-47 Code Language BCP-47 Code Arabic (Egyptian) ar-EG German (Germany) de-DE English (US) en-US Spanish (US) es-US French (France) fr-FR Hindi (India) hi-IN Indonesian (Indonesia) id-ID Italian (Italy) it-IT Japanese (Japan) ja-JP Korean (Korea) ko-KR Portuguese (Brazil) pt-BR Russian (Russia) ru-RU Dutch (Netherlands) nl-NL Polish (Poland) pl-PL Thai (Thailand) th-TH Turkish (Turkey) tr-TR Vietnamese (Vietnam) vi-VN Romanian (Romania) ro-RO Ukrainian (Ukraine) uk-UA Bengali (Bangladesh) bn-BD English (India) en-IN & hi-IN bundle Marathi (India) mr-IN Tamil (India) ta-IN Telugu (India) te-IN Supported models Model Single speaker Multispeaker Gemini 2.5 Flash Preview TTS ✔️ ✔️ Gemini 2.5 Pro Preview TTS ✔️ ✔️ Limitations TTS models can only receive text inputs and generate audio outputs. A TTS session has a context window limit of 32k tokens. Review Languages section for language support. What's next Try the audio generation cookbook . Gemini's Live API offers interactive audio generation options you can interleave with other modalities. For working with audio inputs , visit the Audio understanding guide. Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC. \ No newline at end of file diff --git a/text_content/docs_speech-generation_84e2018c.txt b/text_content/docs_speech-generation_84e2018c.txt new file mode 100644 index 0000000000000000000000000000000000000000..40512af4f98a368298eb5c91a7a016c4aad4cae2 --- /dev/null +++ b/text_content/docs_speech-generation_84e2018c.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/speech-generation#supported-models +Title: Speech generation (text-to-speech)  |  Gemini API  |  Google AI for Developers +================================================== + +Speech generation (text-to-speech) | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. 
Speech generation (text-to-speech) The Gemini API can transform text input into single-speaker or multi-speaker audio using native text-to-speech (TTS) generation capabilities. Text-to-speech (TTS) generation is controllable , meaning you can use natural language to structure interactions and guide the style , accent , pace , and tone of the audio. The TTS capability differs from speech generation provided through the Live API , which is designed for interactive, unstructured audio, and multimodal inputs and outputs. While the Live API excels in dynamic conversational contexts, TTS through the Gemini API is tailored for scenarios that require exact text recitation with fine-grained control over style and sound, such as podcast or audiobook generation. This guide shows you how to generate single-speaker and multi-speaker audio from text. Preview: Native text-to-speech (TTS) is in Preview . Before you begin Ensure you use a Gemini 2.5 model variant with native text-to-speech (TTS) capabilities, as listed in the Supported models section. For optimal results, consider which model best fits your specific use case. You may find it useful to test the Gemini 2.5 TTS models in AI Studio before you start building. Note: TTS models accept text-only inputs and produce audio-only outputs. For a complete list of restrictions specific to TTS models, review the Limitations section. Single-speaker text-to-speech To convert text to single-speaker audio, set the response modality to "audio", and pass a SpeechConfig object with VoiceConfig set. You'll need to choose a voice name from the prebuilt output voices . This example saves the output audio from the model in a wave file: Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = "Say cheerfully: Have a wonderful day!" , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory For more code samples, refer to the "TTS - Get Started" file in the cookbooks repository: View on GitHub JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : 'Say cheerfully: Have a wonderful day!'
}] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' }, }, }, }, }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "Say cheerfully: Have a wonderful day!" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode >out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Multi-speaker text-to-speech For multi-speaker audio, you'll need a MultiSpeakerVoiceConfig object with each speaker (up to 2) configured as a SpeakerVoiceConfig . You'll need to define each speaker with the same names used in the prompt : Python from google import genai from google.genai import types import wave # Set up the wave file to save the output: def wave_file ( filename , pcm , channels = 1 , rate = 24000 , sample_width = 2 ): with wave . open ( filename , "wb" ) as wf : wf . setnchannels ( channels ) wf . setsampwidth ( sample_width ) wf . setframerate ( rate ) wf . writeframes ( pcm ) client = genai . Client () prompt = """TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?""" response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = prompt , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Joe' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Jane' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Puck' , ) ) ), ] ) ) ) ) data = response . candidates [ 0 ] . content . parts [ 0 ] . inline_data . data file_name = 'out.wav' wave_file ( file_name , data ) # Saves the file to current directory JavaScript import { GoogleGenAI } from '@google/genai' ; import wav from 'wav' ; async function saveWaveFile ( filename , pcmData , channels = 1 , rate = 24000 , sampleWidth = 2 , ) { return new Promise (( resolve , reject ) = > { const writer = new wav . FileWriter ( filename , { channels , sampleRate : rate , bitDepth : sampleWidth * 8 , }); writer . on ( 'finish' , resolve ); writer . on ( 'error' , reject ); writer . write ( pcmData ); writer . end (); }); } async function main () { const ai = new GoogleGenAI ({}); const prompt = `TTS the following conversation between Joe and Jane: Joe: How's it going today Jane? Jane: Not too bad, how about you?` ; const response = await ai . models . 
generateContent ({ model : "gemini-2.5-flash-preview-tts" , contents : [{ parts : [{ text : prompt }] }], config : { responseModalities : [ 'AUDIO' ], speechConfig : { multiSpeakerVoiceConfig : { speakerVoiceConfigs : [ { speaker : 'Joe' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Kore' } } }, { speaker : 'Jane' , voiceConfig : { prebuiltVoiceConfig : { voiceName : 'Puck' } } } ] } } } }); const data = response . candidates ? .[ 0 ] ? . content ? . parts ? .[ 0 ] ? . inlineData ? . data ; const audioBuffer = Buffer . from ( data , 'base64' ); const fileName = 'out.wav' ; await saveWaveFile ( fileName , audioBuffer ); } await main (); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -X POST \ -H "Content-Type: application/json" \ -d '{ "contents": [{ "parts":[{ "text": "TTS the following conversation between Joe and Jane: Joe: Hows it going today Jane? Jane: Not too bad, how about you?" }] }], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { "multiSpeakerVoiceConfig": { "speakerVoiceConfigs": [{ "speaker": "Joe", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Kore" } } }, { "speaker": "Jane", "voiceConfig": { "prebuiltVoiceConfig": { "voiceName": "Puck" } } }] } } }, "model": "gemini-2.5-flash-preview-tts", }' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \ base64 --decode > out.pcm # You may need to install ffmpeg. ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav Controlling speech style with prompts You can control style, tone, accent, and pace using natural language prompts for both single- and multi-speaker TTS. For example, in a single-speaker prompt, you can say: Say in an spooky whisper: "By the pricking of my thumbs... Something wicked this way comes" In a multi-speaker prompt, provide the model with each speaker's name and corresponding transcript. You can also provide guidance for each speaker individually: Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy: Speaker1: So... what's on the agenda today? Speaker2: You're never going to guess! Try using a voice option that corresponds to the style or emotion you want to convey, to emphasize it even more. In the previous prompt, for example, Enceladus 's breathiness might emphasize "tired" and "bored", while Puck 's upbeat tone could complement "excited" and "happy". Generating a prompt to convert to audio The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud. Python from google import genai from google.genai import types client = genai . Client () transcript = client . models . generate_content ( model = "gemini-2.0-flash" , contents = """Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.""" ) . text response = client . models . generate_content ( model = "gemini-2.5-flash-preview-tts" , contents = transcript , config = types . GenerateContentConfig ( response_modalities = [ "AUDIO" ], speech_config = types . SpeechConfig ( multi_speaker_voice_config = types . MultiSpeakerVoiceConfig ( speaker_voice_configs = [ types . SpeakerVoiceConfig ( speaker = 'Dr. Anya' , voice_config = types . VoiceConfig ( prebuilt_voice_config = types . PrebuiltVoiceConfig ( voice_name = 'Kore' , ) ) ), types . SpeakerVoiceConfig ( speaker = 'Liam' , voice_config = types . 
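As a concrete illustration of the style prompts above, here is a minimal Python sketch that pairs a style instruction with a matching prebuilt voice. It reuses the client, types, and wave_file helper from the earlier examples; the prompt text and the Enceladus voice choice are illustrative assumptions, not a fixed recipe:

Python

# Hypothetical pairing of a style prompt with a reinforcing voice;
# reuses `client`, `types`, and `wave_file` from the examples above.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents='Say in a tired, bored drawl: "So... what\'s on the agenda today?"',
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name='Enceladus',  # breathy voice, chosen to reinforce "tired"
                )
            )
        ),
    )
)

wave_file('styled_out.wav',
          response.candidates[0].content.parts[0].inline_data.data)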
Generating a prompt to convert to audio

The TTS models only output audio, but you can use other models to generate a transcript first, then pass that transcript to the TTS model to read aloud.

Python

from google import genai
from google.genai import types

client = genai.Client()

transcript = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="""Generate a short transcript around 100 words that reads
    like it was clipped from a podcast by excited herpetologists.
    The hosts' names are Dr. Anya and Liam.""").text

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=transcript,
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                speaker_voice_configs=[
                    types.SpeakerVoiceConfig(
                        speaker='Dr. Anya',
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name='Kore',
                            )
                        )
                    ),
                    types.SpeakerVoiceConfig(
                        speaker='Liam',
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name='Puck',
                            )
                        )
                    ),
                ]
            )
        )
    )
)

# ...Code to stream or save the output

JavaScript

import {GoogleGenAI} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const transcript = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts' names are Dr. Anya and Liam.",
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash-preview-tts",
    contents: transcript.text,  // pass the generated text, not the response object
    config: {
      responseModalities: ['AUDIO'],
      speechConfig: {
        multiSpeakerVoiceConfig: {
          speakerVoiceConfigs: [
            {
              speaker: "Dr. Anya",
              voiceConfig: {prebuiltVoiceConfig: {voiceName: "Kore"}},
            },
            {
              speaker: "Liam",
              voiceConfig: {prebuiltVoiceConfig: {voiceName: "Puck"}},
            }
          ]
        }
      }
    }
  });
}

// ...JavaScript code for exporting .wav file for output audio
await main();

Voice options

TTS models support the following 30 voice options in the voice_name field:

Zephyr -- Bright
Puck -- Upbeat
Charon -- Informative
Kore -- Firm
Fenrir -- Excitable
Leda -- Youthful
Orus -- Firm
Aoede -- Breezy
Callirrhoe -- Easy-going
Autonoe -- Bright
Enceladus -- Breathy
Iapetus -- Clear
Umbriel -- Easy-going
Algieba -- Smooth
Despina -- Smooth
Erinome -- Clear
Algenib -- Gravelly
Rasalgethi -- Informative
Laomedeia -- Upbeat
Achernar -- Soft
Alnilam -- Firm
Schedar -- Even
Gacrux -- Mature
Pulcherrima -- Forward
Achird -- Friendly
Zubenelgenubi -- Casual
Vindemiatrix -- Gentle
Sadachbia -- Lively
Sadaltager -- Knowledgeable
Sulafat -- Warm

You can hear all the voice options in AI Studio.

Supported languages

The TTS models detect the input language automatically. They support the following 24 languages:

Language -- BCP-47 Code
Arabic (Egyptian) -- ar-EG
German (Germany) -- de-DE
English (US) -- en-US
Spanish (US) -- es-US
French (France) -- fr-FR
Hindi (India) -- hi-IN
Indonesian (Indonesia) -- id-ID
Italian (Italy) -- it-IT
Japanese (Japan) -- ja-JP
Korean (Korea) -- ko-KR
Portuguese (Brazil) -- pt-BR
Russian (Russia) -- ru-RU
Dutch (Netherlands) -- nl-NL
Polish (Poland) -- pl-PL
Thai (Thailand) -- th-TH
Turkish (Turkey) -- tr-TR
Vietnamese (Vietnam) -- vi-VN
Romanian (Romania) -- ro-RO
Ukrainian (Ukraine) -- uk-UA
Bengali (Bangladesh) -- bn-BD
English (India) -- en-IN & hi-IN bundle
Marathi (India) -- mr-IN
Tamil (India) -- ta-IN
Telugu (India) -- te-IN

Supported models

Model -- Single speaker -- Multispeaker
Gemini 2.5 Flash Preview TTS -- ✔️ -- ✔️
Gemini 2.5 Pro Preview TTS -- ✔️ -- ✔️

Limitations

TTS models can only receive text inputs and generate audio outputs.
A TTS session has a context window limit of 32k tokens.
Review the Supported languages section for language support.

What's next

Try the audio generation cookbook.
Gemini's Live API offers interactive audio generation options you can interleave with other modalities.
For working with audio inputs, visit the Audio understanding guide.
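Given the 32k-token context limit noted in the Limitations above, it can help to check a transcript's token count before submitting it. A minimal Python sketch: the long_transcript variable is a placeholder, and it is an assumption that count_tokens accepts the TTS preview model name (if not, counting against a standard Gemini model gives a close approximation):

Python

from google import genai

client = genai.Client()

long_transcript = "..."  # placeholder: the text you plan to convert to speech

# count_tokens reports how many tokens the request would consume.
token_count = client.models.count_tokens(
    model="gemini-2.5-flash-preview-tts",  # assumed to be accepted here
    contents=long_transcript,
)
print(token_count.total_tokens)  # keep this under the 32k context window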
\ No newline at end of file
diff --git a/text_content/docs_structured-output_12d22815.txt b/text_content/docs_structured-output_12d22815.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c283209ee2fa5b8d0c34b676424d684d72f3bb45
--- /dev/null
+++ b/text_content/docs_structured-output_12d22815.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/structured-output
+Title: Structured output  |  Gemini API  |  Google AI for Developers
+==================================================

Structured output

You can configure Gemini for structured output instead of unstructured text, allowing precise extraction and standardization of information for further processing. For example, you can use structured output to extract information from resumes and standardize it to build a structured database. Gemini can generate either JSON or enum values as structured output.

Generating JSON

There are two ways to generate JSON using the Gemini API:

Configure a schema on the model
Provide a schema in a text prompt

Configuring a schema on the model is the recommended way to generate JSON, because it constrains the model to output JSON.

Configuring a schema (recommended)

To constrain the model to generate JSON, configure a responseSchema. The model will then respond to any prompt with JSON-formatted output.

Python

from google import genai
from pydantic import BaseModel

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    },
)
# Use the response as a JSON string.
print(response.text)

# Use instantiated objects.
my_recipes: list[Recipe] = response.parsed

Note: Pydantic validators are not yet supported. If a pydantic.ValidationError occurs, it is suppressed, and .parsed may be empty/null.
JavaScript

import { GoogleGenAI, Type } from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: "List a few popular cookie recipes, and include the amounts of ingredients.",
    config: {
      responseMimeType: "application/json",
      responseSchema: {
        type: Type.ARRAY,
        items: {
          type: Type.OBJECT,
          properties: {
            recipeName: { type: Type.STRING },
            ingredients: {
              type: Type.ARRAY,
              items: { type: Type.STRING },
            },
          },
          propertyOrdering: ["recipeName", "ingredients"],
        },
      },
    },
  });

  console.log(response.text);
}

main();

Go

package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    config := &genai.GenerateContentConfig{
        ResponseMIMEType: "application/json",
        ResponseSchema: &genai.Schema{
            Type: genai.TypeArray,
            Items: &genai.Schema{
                Type: genai.TypeObject,
                Properties: map[string]*genai.Schema{
                    "recipeName": {Type: genai.TypeString},
                    "ingredients": {
                        Type:  genai.TypeArray,
                        Items: &genai.Schema{Type: genai.TypeString},
                    },
                },
                PropertyOrdering: []string{"recipeName", "ingredients"},
            },
        },
    }

    result, err := client.Models.GenerateContent(
        ctx,
        "gemini-2.5-flash",
        genai.Text("List a few popular cookie recipes, and include the amounts of ingredients."),
        config,
    )
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(result.Text())
}

REST

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -d '{
    "contents": [{
      "parts":[
        { "text": "List a few popular cookie recipes, and include the amounts of ingredients." }
      ]
    }],
    "generationConfig": {
      "responseMimeType": "application/json",
      "responseSchema": {
        "type": "ARRAY",
        "items": {
          "type": "OBJECT",
          "properties": {
            "recipeName": { "type": "STRING" },
            "ingredients": {
              "type": "ARRAY",
              "items": { "type": "STRING" }
            }
          },
          "propertyOrdering": ["recipeName", "ingredients"]
        }
      }
    }
  }' 2> /dev/null | head

The output might look like this:

[
  {
    "recipeName": "Chocolate Chip Cookies",
    "ingredients": [
      "1 cup (2 sticks) unsalted butter, softened",
      "3/4 cup granulated sugar",
      "3/4 cup packed brown sugar",
      "1 teaspoon vanilla extract",
      "2 large eggs",
      "2 1/4 cups all-purpose flour",
      "1 teaspoon baking soda",
      "1 teaspoon salt",
      "2 cups chocolate chips"
    ]
  },
  ...
]
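As noted above for the Python example, response.parsed can come back empty when Pydantic validation fails. A minimal defensive sketch, assuming the Recipe model and response from the Python example above:

Python

import json

# Hypothetical fallback for when response.parsed is empty (e.g., after a
# suppressed pydantic.ValidationError); assumes `response` and `Recipe`
# from the Python example above.
recipes = response.parsed
if not recipes:
    # Fall back to the raw JSON string and validate manually.
    raw = json.loads(response.text)
    recipes = [Recipe.model_validate(item) for item in raw]
print(recipes)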
Providing a schema in a text prompt

Instead of configuring a schema, you can supply a schema as natural language or pseudo-code in a text prompt. This method is not recommended, because it might produce lower quality output, and because the model is not constrained to follow the schema.

Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema. This can produce unexpected or low quality results.

Here's a generic example of a schema provided in a text prompt:

List a few popular cookie recipes, and include the amounts of ingredients.

Produce JSON matching this specification:

Recipe = { "recipeName": string, "ingredients": array }
Return: array

Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt.

Generating enum values

In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema, because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application.

For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion", "String", "Woodwind", "Brass", or "Keyboard". You could create an enum to help with this task.

In the following example, you pass an enum as the responseSchema, constraining the model to choose the most appropriate option.

Python

from google import genai
import enum

class Instrument(enum.Enum):
    PERCUSSION = "Percussion"
    STRING = "String"
    WOODWIND = "Woodwind"
    BRASS = "Brass"
    KEYBOARD = "Keyboard"

client = genai.Client()
response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents='What type of instrument is an oboe?',
    config={
        'response_mime_type': 'text/x.enum',
        'response_schema': Instrument,
    },
)

print(response.text)  # Woodwind

JavaScript

import { GoogleGenAI, Type } from "@google/genai";

const ai = new GoogleGenAI({});

const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: "What type of instrument is an oboe?",
  config: {
    responseMimeType: "text/x.enum",
    responseSchema: {
      type: Type.STRING,
      enum: ["Percussion", "String", "Woodwind", "Brass", "Keyboard"],
    },
  },
});

console.log(response.text);

REST

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -d '{
    "contents": [{
      "parts":[
        { "text": "What type of instrument is an oboe?" }
      ]
    }],
    "generationConfig": {
      "responseMimeType": "text/x.enum",
      "responseSchema": {
        "type": "STRING",
        "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"]
      }
    }
  }'

The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema (Schema).

There are two other ways to specify an enumeration. You can use a Literal:

Python

Literal["Percussion", "String", "Woodwind", "Brass", "Keyboard"]

And you can also pass the schema as JSON:

Python

from google import genai

client = genai.Client()
response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents='What type of instrument is an oboe?',
    config={
        'response_mime_type': 'text/x.enum',
        'response_schema': {
            "type": "STRING",
            "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"],
        },
    },
)

print(response.text)  # Woodwind
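In addition to the raw text, the Python SDK can hand back the selected option via response.parsed. A minimal sketch continuing the Instrument example above; whether .parsed returns the enum member (rather than a plain string) is an SDK convenience worth verifying in your installed version:

Python

# Continues the Instrument example above.
inst = response.parsed
print(inst)        # expected: Instrument.WOODWIND (verify in your SDK version)
print(inst.value)  # "Woodwind"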
Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade:

Python

from google import genai
import enum
from pydantic import BaseModel

class Grade(enum.Enum):
    A_PLUS = "a+"
    A = "a"
    B = "b"
    C = "c"
    D = "d"
    F = "f"

class Recipe(BaseModel):
    recipe_name: str
    rating: Grade

client = genai.Client()
response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents='List 10 home-baked cookie recipes and give them grades based on tastiness.',
    config={
        'response_mime_type': 'application/json',
        'response_schema': list[Recipe],
    },
)

print(response.text)

The response might look like this:

[
  { "recipe_name": "Chocolate Chip Cookies", "rating": "a+" },
  { "recipe_name": "Peanut Butter Cookies", "rating": "a" },
  { "recipe_name": "Oatmeal Raisin Cookies", "rating": "b" },
  ...
]

About JSON schemas

Configuring the model for JSON output using the responseSchema parameter relies on the Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object, and also adds a propertyOrdering field.

Tip: On Python, when you use a Pydantic model, you don't need to work directly with Schema objects, as the model is automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python.

Here's a pseudo-JSON representation of all the Schema fields:

{
  "type": enum (Type),
  "format": string,
  "description": string,
  "nullable": boolean,
  "enum": [ string ],
  "maxItems": integer,
  "minItems": integer,
  "properties": { string: { object (Schema) }, ... },
  "required": [ string ],
  "propertyOrdering": [ string ],
  "items": { object (Schema) }
}

The Type of the schema must be one of the OpenAPI Data Types, or a union of those types (using anyOf). Only a subset of fields is valid for each Type. The following list maps each Type to the fields that are valid for it:

string -> enum, format, nullable
integer -> format, minimum, maximum, enum, nullable
number -> format, minimum, maximum, enum, nullable
boolean -> nullable
array -> minItems, maxItems, items, nullable
object -> properties, required, propertyOrdering, nullable

Here are some example schemas showing valid type-and-field combinations:

{ "type": "string", "enum": ["a", "b", "c"] }
{ "type": "string", "format": "date-time" }
{ "type": "integer", "format": "int64" }
{ "type": "number", "format": "double" }
{ "type": "boolean" }
{ "type": "array", "minItems": 3, "maxItems": 3, "items": { "type": ... } }
{ "type": "object",
  "properties": {
    "a": { "type": ... },
    "b": { "type": ... },
    "c": { "type": ... }
  },
  "nullable": true,
  "required": ["c"],
  "propertyOrdering": ["c", "b", "a"] }

For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference.

Property ordering

Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[], and when you provide examples, make sure that the property ordering in the examples matches the schema.

When you're working with JSON schemas in the Gemini API, the order of properties is important. By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected.

To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field.

"propertyOrdering": ["recipeName", "ingredients"]

propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema.
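A minimal Python sketch of a manually created types.Schema with the ordering set explicitly, reusing the recipe fields from the earlier examples; the snake_case spelling property_ordering is assumed to mirror the REST field propertyOrdering and is worth checking against the Schema reference:

Python

from google import genai
from google.genai import types

client = genai.Client()

# Manually built Schema so that property ordering can be set explicitly.
recipe_schema = types.Schema(
    type=types.Type.ARRAY,
    items=types.Schema(
        type=types.Type.OBJECT,
        properties={
            "recipeName": types.Schema(type=types.Type.STRING),
            "ingredients": types.Schema(
                type=types.Type.ARRAY,
                items=types.Schema(type=types.Type.STRING),
            ),
        },
        property_ordering=["recipeName", "ingredients"],  # assumed snake_case spelling
    ),
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": recipe_schema,
    },
)
print(response.text)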
Schemas in Python

When you're using the Python library, the value of response_schema must be one of the following:

A type, as you would use in a type annotation (see the Python typing module)
An instance of genai.types.Schema
The dict equivalent of genai.types.Schema

The easiest way to define a schema is with a Pydantic type (as shown in the previous example):

Python

config = {'response_mime_type': 'application/json', 'response_schema': list[Recipe]}

When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs.

The Python library supports schemas defined with the following types (where AllowedType is any allowed type):

int
float
bool
str
list[AllowedType]
AllowedType|AllowedType|...
For structured types:
  dict[str, AllowedType]. This annotation declares all dict values to be the same type, but doesn't specify what keys should be included.
  User-defined Pydantic models. This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures.

JSON Schema support

JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the field responseJsonSchema, which accepts any JSON Schema with the following limitations:

It only works with Gemini 2.5.
While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details.
Recursive references can only be used as the value of a non-required object property.
Recursive references are unrolled to a finite degree, based on the size of the schema.
Schemas that contain $ref cannot contain any properties other than those starting with a $.

Here's an example of generating a JSON Schema with Pydantic and submitting it to the model.
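In REST, this flow posts to the v1alpha generateContent endpoint. A minimal Python sketch of the same idea; the config key response_json_schema is an assumed snake_case mirror of the REST field responseJsonSchema:

Python

from google import genai
from pydantic import BaseModel

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        # model_json_schema() emits a standard JSON Schema from the Pydantic model;
        # response_json_schema is assumed to map to responseJsonSchema.
        "response_json_schema": Recipe.model_json_schema(),
    },
)
print(response.text)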
Warning: Don't provide a schema in a text prompt if you're configuring a responseSchema . This can produce unexpected or low quality results. Here's a generic example of a schema provided in a text prompt: List a few popular cookie recipes, and include the amounts of ingredients. Produce JSON matching this specification: Recipe = { "recipeName": string, "ingredients": array } Return: array Since the model gets the schema from text in the prompt, you might have some flexibility in how you represent the schema. But when you supply a schema inline like this, the model is not actually constrained to return JSON. For a more deterministic, higher quality response, configure a schema on the model, and don't duplicate the schema in the text prompt. Generating enum values In some cases you might want the model to choose a single option from a list of options. To implement this behavior, you can pass an enum in your schema. You can use an enum option anywhere you could use a string in the responseSchema , because an enum is an array of strings. Like a JSON schema, an enum lets you constrain model output to meet the requirements of your application. For example, assume that you're developing an application to classify musical instruments into one of five categories: "Percussion" , "String" , "Woodwind" , "Brass" , or " "Keyboard" ". You could create an enum to help with this task. In the following example, you pass an enum as the responseSchema , constraining the model to choose the most appropriate option. Python from google import genai import enum class Instrument ( enum . Enum ): PERCUSSION = "Percussion" STRING = "String" WOODWIND = "Woodwind" BRASS = "Brass" KEYBOARD = "Keyboard" client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' , config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : Instrument , }, ) print ( response . text ) # Woodwind JavaScript import { GoogleGenAI , Type } from "@google/genai" ; const ai = new GoogleGenAI ({}); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "What type of instrument is an oboe?" , config : { responseMimeType : "text/x.enum" , responseSchema : { type : Type . STRING , enum : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, }); console . log ( response . text ); REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ "parts":[ { "text": "What type of instrument is an oboe?" } ] }], "generationConfig": { "responseMimeType": "text/x.enum", "responseSchema": { "type": "STRING", "enum": ["Percussion", "String", "Woodwind", "Brass", "Keyboard"] } } }' The Python library will translate the type declarations for the API. However, the API accepts a subset of the OpenAPI 3.0 schema ( Schema ). There are two other ways to specify an enumeration. You can use a Literal : ``` Python Literal [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ] And you can also pass the schema as JSON: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'What type of instrument is an oboe?' 
, config = { 'response_mime_type' : 'text/x.enum' , 'response_schema' : { "type" : "STRING" , "enum" : [ "Percussion" , "String" , "Woodwind" , "Brass" , "Keyboard" ], }, }, ) print ( response . text ) # Woodwind Beyond basic multiple choice problems, you can use an enum anywhere in a JSON schema. For example, you could ask the model for a list of recipe titles and use a Grade enum to give each title a popularity grade: Python from google import genai import enum from pydantic import BaseModel class Grade ( enum . Enum ): A_PLUS = "a+" A = "a" B = "b" C = "c" D = "d" F = "f" class Recipe ( BaseModel ): recipe_name : str rating : Grade client = genai . Client () response = client . models . generate_content ( model = 'gemini-2.5-flash' , contents = 'List 10 home-baked cookie recipes and give them grades based on tastiness.' , config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ], }, ) print ( response . text ) The response might look like this: [ { "recipe_name" : "Chocolate Chip Cookies" , "rating" : "a+" }, { "recipe_name" : "Peanut Butter Cookies" , "rating" : "a" }, { "recipe_name" : "Oatmeal Raisin Cookies" , "rating" : "b" }, ... ] About JSON schemas Configuring the model for JSON output using responseSchema parameter relies on Schema object to define its structure. This object represents a select subset of the OpenAPI 3.0 Schema object , and also adds a propertyOrdering field. Tip: On Python, when you use a Pydantic model, you don't need to directly work with Schema objects, as it gets automatically converted to the corresponding JSON schema. To learn more, see JSON schemas in Python . Here's a pseudo-JSON representation of all the Schema fields: { "type": enum (Type), "format": string, "description": string, "nullable": boolean, "enum": [ string ], "maxItems": integer, "minItems": integer, "properties": { string: { object (Schema) }, ... }, "required": [ string ], "propertyOrdering": [ string ], "items": { object (Schema) } } The Type of the schema must be one of the OpenAPI Data Types , or a union of those types (using anyOf ). Only a subset of fields is valid for each Type . The following list maps each Type to a subset of the fields that are valid for that type: string -> enum , format , nullable integer -> format , minimum , maximum , enum , nullable number -> format , minimum , maximum , enum , nullable boolean -> nullable array -> minItems , maxItems , items , nullable object -> properties , required , propertyOrdering , nullable Here are some example schemas showing valid type-and-field combinations: { "type" : "string" , "enum" : [ "a" , "b" , "c" ] } { "type" : "string" , "format" : "date-time" } { "type" : "integer" , "format" : "int64" } { "type" : "number" , "format" : "double" } { "type" : "boolean" } { "type" : "array" , "minItems" : 3 , "maxItems" : 3 , "items" : { "type" : ... } } { "type" : "object" , "properties" : { "a" : { "type" : ... }, "b" : { "type" : ... }, "c" : { "type" : ... } }, "nullable" : true , "required" : [ "c" ], "propertyOrdering" : [ "c" , "b" , "a" ] } For complete documentation of the Schema fields as they're used in the Gemini API, see the Schema reference . Property ordering Warning: When you're configuring a JSON schema, make sure to set propertyOrdering[] , and when you provide examples, make sure that the property ordering in the examples matches the schema. When you're working with JSON schemas in the Gemini API, the order of properties is important. 
By default, the API orders properties alphabetically and does not preserve the order in which the properties are defined (although the Google Gen AI SDKs may preserve this order). If you're providing examples to the model with a schema configured, and the property ordering of the examples is not consistent with the property ordering of the schema, the output could be rambling or unexpected. To ensure a consistent, predictable ordering of properties, you can use the optional propertyOrdering[] field. "propertyOrdering" : [ "recipeName" , "ingredients" ] propertyOrdering[] – not a standard field in the OpenAPI specification – is an array of strings used to determine the order of properties in the response. By specifying the order of properties and then providing examples with properties in that same order, you can potentially improve the quality of results. propertyOrdering is only supported when you manually create types.Schema . Schemas in Python When you're using the Python library, the value of response_schema must be one of the following: A type, as you would use in a type annotation (see the Python typing module ) An instance of genai.types.Schema The dict equivalent of genai.types.Schema The easiest way to define a schema is with a Pydantic type (as shown in the previous example): Python config = { 'response_mime_type' : 'application/json' , 'response_schema' : list [ Recipe ]} When you use a Pydantic type, the Python library builds out a JSON schema for you and sends it to the API. For additional examples, see the Python library docs . The Python library supports schemas defined with the following types (where AllowedType is any allowed type): int float bool str list[AllowedType] AllowedType|AllowedType|... For structured types: dict[str, AllowedType] . This annotation declares all dict values to be the same type, but doesn't specify what keys should be included. User-defined Pydantic models . This approach lets you specify the key names and define different types for the values associated with each of the keys, including nested structures. JSON Schema support JSON Schema is a more recent specification than OpenAPI 3.0, which the Schema object is based on. Support for JSON Schema is available as a preview using the field responseJsonSchema which accepts any JSON Schema with the following limitations: It only works with Gemini 2.5. While all JSON Schema properties can be passed, not all are supported. See the documentation for the field for more details. Recursive references can only be used as the value of a non-required object property. Recursive references are unrolled to a finite degree, based on the size of the schema. Schemas that contain $ref cannot contain any properties other than those starting with a $ . 
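Tying together the propertyOrdering and types.Schema points above: here's a minimal sketch of a hand-built schema for the recipe example. It assumes the google-genai Python SDK surfaces the REST propertyOrdering field as a snake_case property_ordering attribute on types.Schema; a Pydantic-derived schema wouldn't let you set this field. Python

from google import genai
from google.genai import types

client = genai.Client()

# Hand-built schema so that property ordering can be pinned.
# Assumption: the SDK exposes propertyOrdering as property_ordering.
recipe_schema = types.Schema(
    type=types.Type.OBJECT,
    properties={
        "recipeName": types.Schema(type=types.Type.STRING),
        "ingredients": types.Schema(
            type=types.Type.ARRAY,
            items=types.Schema(type=types.Type.STRING),
        ),
    },
    required=["recipeName", "ingredients"],
    property_ordering=["recipeName", "ingredients"],
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List a few popular cookie recipes, and include the amounts of ingredients.",
    config={
        "response_mime_type": "application/json",
        "response_schema": types.Schema(
            type=types.Type.ARRAY,
            items=recipe_schema,
        ),
    },
)
print(response.text)

If you also provide few-shot examples, order their properties the same way (recipeName first, then ingredients) so the examples and the schema agree.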
Here's an example of generating a JSON Schema with Pydantic and submitting it to the model.
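The sketch below shows one way to do this, assuming the preview responseJsonSchema field is accepted inside generationConfig on the v1alpha endpoint; the Recipe model and prompt are illustrative. Python

import os

import requests
from pydantic import BaseModel

class Recipe(BaseModel):
    recipe_name: str
    ingredients: list[str]

# Pydantic emits a standard JSON Schema for the model class.
json_schema = Recipe.model_json_schema()

payload = {
    "contents": [{"parts": [{"text": "Give me a popular cookie recipe."}]}],
    "generationConfig": {
        # Preview field: takes a JSON Schema, not an OpenAPI-style Schema.
        "responseJsonSchema": json_schema,
        "responseMimeType": "application/json",
    },
}

resp = requests.post(
    "https://generativelanguage.googleapis.com/v1alpha/models/gemini-2.5-flash:generateContent",
    headers={"x-goog-api-key": os.environ["GEMINI_API_KEY"]},
    json=payload,
)
print(resp.json()["candidates"][0]["content"]["parts"][0]["text"])

Keep the limitations listed above in mind; in particular, this preview only works with Gemini 2.5 models.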
\ No newline at end of file diff --git a/text_content/docs_text-generation_3fd5e0fe.txt b/text_content/docs_text-generation_3fd5e0fe.txt new file mode 100644 index 0000000000000000000000000000000000000000..d22925af4013a2a705d3efa9cd0702ab8babad7d --- /dev/null +++ b/text_content/docs_text-generation_3fd5e0fe.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation#system-instructions +Title: Text generation  |  Gemini API  |  Google AI for Developers +================================================== + +Text generation | Gemini API | Google AI for Developers Text generation The Gemini API can generate text output from various inputs, including text, images, video, and audio, leveraging Gemini models. Here's a basic example that takes a single text input: Python from google import genai client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Explain how AI works in a few words" ), nil , ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How does AI work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Thinking with Gemini 2.5 2.5 Flash and Pro models have "thinking" enabled by default to enhance quality, which may take longer to run and increase token usage. When using 2.5 Flash, you can disable thinking by setting the thinking budget to zero. For more details, see the thinking guide . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = "How does AI work?"
, config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 0 ) # Disables thinking ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "How does AI work?" , config : { thinkingConfig : { thinkingBudget : 0 , // Disables thinking }, } }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "How does AI work?" ), & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : int32 ( 0 ), // Disables thinking }, }, ) fmt . Println ( result . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "How does AI work?" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 0 } } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'How does AI work?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } System instructions and other configurations You can guide the behavior of Gemini models with system instructions. To do so, pass a GenerateContentConfig object. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , config = types . GenerateContentConfig ( system_instruction = "You are a cat. Your name is Neko." ), contents = "Hello there" ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Hello there" , config : { systemInstruction : "You are a cat. Your name is Neko." , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } config := & genai . GenerateContentConfig { SystemInstruction : genai . NewContentFromText ( "You are a cat. Your name is Neko." , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "Hello there" ), config , ) fmt . Println ( result .
Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -d '{ "system_instruction": { "parts": [ { "text": "You are a cat. Your name is Neko." } ] }, "contents": [ { "parts": [ { "text": "Hello there" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const systemInstruction = { parts : [{ text : 'You are a cat. Your name is Neko.' }] }; const payload = { systemInstruction , contents : [ { parts : [ { text : 'Hello there' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } The GenerateContentConfig object also lets you override default generation parameters, such as temperature . Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ], config = types . GenerateContentConfig ( temperature = 0.1 ) ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , config : { temperature : 0.1 , }, }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } temp := float32 ( 0.9 ) topP := float32 ( 0.5 ) topK := float32 ( 20.0 ) config := & genai . GenerateContentConfig { Temperature : & temp , TopP : & topP , TopK : & topK , ResponseMIMEType : "application/json" , } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , genai . Text ( "What is the average size of a swallow?" ), config , ) fmt . Println ( result . Text ()) } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ], "generationConfig": { "stopSequences": [ "Title" ], "temperature": 1.0, "topP": 0.8, "topK": 10 } }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const generationConfig = { temperature : 1 , topP : 0.95 , topK : 40 , responseMimeType : 'text/plain' , }; const payload = { generationConfig , contents : [ { parts : [ { text : 'Explain how AI works in a few words' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Refer to the GenerateContentConfig in our API reference for a complete list of configurable parameters and their descriptions. Multimodal inputs The Gemini API supports multimodal inputs, allowing you to combine text with media files. The following example demonstrates providing an image: Python from PIL import Image from google import genai client = genai . Client () image = Image . open ( "/path/to/organ.png" ) response = client . models . generate_content ( model = "gemini-2.5-flash" , contents = [ image , "Tell me about this instrument" ] ) print ( response . text ) JavaScript import { GoogleGenAI , createUserContent , createPartFromUri , } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const image = await ai . files . upload ({ file : "/path/to/organ.png" , }); const response = await ai . models . generateContent ({ model : "gemini-2.5-flash" , contents : [ createUserContent ([ "Tell me about this instrument" , createPartFromUri ( image . uri , image . mimeType ), ]), ], }); console . log ( response . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } imagePath := "/path/to/organ.jpg" imgData , _ := os . ReadFile ( imagePath ) parts := [] * genai . Part { genai . NewPartFromText ( "Tell me about this instrument" ), & genai . Part { InlineData : & genai . Blob { MIMEType : "image/jpeg" , Data : imgData , }, }, } contents := [] * genai . Content { genai . NewContentFromParts ( parts , genai . RoleUser ), } result , _ := client . Models . GenerateContent ( ctx , "gemini-2.5-flash" , contents , nil , ) fmt . Println ( result . Text ()) } REST # Use a temporary file to hold the base64 encoded image data TEMP_B64 = $( mktemp ) trap 'rm -f "$TEMP_B64"' EXIT base64 $B64FLAGS $IMG_PATH > " $TEMP_B64 " # Use a temporary file to hold the JSON payload TEMP_JSON = $( mktemp ) trap 'rm -f "$TEMP_JSON"' EXIT cat > " $TEMP_JSON " << EOF { "contents" : [ { "parts" : [ { "text" : "Tell me about this instrument" } , { "inline_data" : { "mime_type" : "image/jpeg" , "data" : " $( cat " $TEMP_B64 " ) " } } ] } ] } EOF curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d "@ $TEMP_JSON " Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). 
getProperty ( 'GEMINI_API_KEY' ); function main () { const imageUrl = 'http://image/url' ; const image = getImageData ( imageUrl ); const payload = { contents : [ { parts : [ { image }, { text : 'Tell me about this instrument' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } function getImageData ( url ) { const blob = UrlFetchApp . fetch ( url ). getBlob (); return { mimeType : blob . getContentType (), data : Utilities . base64Encode ( blob . getBytes ()) }; } For alternative methods of providing images and more advanced image processing, see our image understanding guide . The API also supports document , video , and audio inputs and understanding. Streaming responses By default, the model returns a response only after the entire generation process is complete. For more fluid interactions, use streaming to receive GenerateContentResponse instances incrementally as they're generated. Python from google import genai client = genai . Client () response = client . models . generate_content_stream ( model = "gemini-2.5-flash" , contents = [ "Explain how AI works" ] ) for chunk in response : print ( chunk . text , end = "" ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-flash" , contents : "Explain how AI works" , }); for await ( const chunk of response ) { console . log ( chunk . text ); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } stream := client . Models . GenerateContentStream ( ctx , "gemini-2.5-flash" , genai . Text ( "Write a story about a magic backpack." ), nil , ) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ --no-buffer \ -d '{ "contents": [ { "parts": [ { "text": "Explain how AI works" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { parts : [ { text : 'Explain how AI works' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . 
log ( content ); } Multi-turn conversations (Chat) Our SDKs provide functionality to collect multiple rounds of prompts and responses into a chat, giving you an easy way to keep track of the conversation history. Note: Chat functionality is only implemented as part of the SDKs. Behind the scenes, it still uses the generateContent API. For multi-turn conversations, the full conversation history is sent to the model with each follow-up turn. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message ( "I have 2 dogs in my house." ) print ( response . text ) response = chat . send_message ( "How many paws are in my house?" ) print ( response . text ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const response1 = await chat . sendMessage ({ message : "I have 2 dogs in my house." , }); console . log ( "Chat response 1:" , response1 . text ); const response2 = await chat . sendMessage ({ message : "How many paws are in my house?" , }); console . log ( "Chat response 2:" , response2 . text ); } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) res , _ := chat . SendMessage ( ctx , genai . Part { Text : "How many paws are in my house?" }) if len ( res . Candidates ) > 0 { fmt . Println ( res . Candidates [ 0 ]. Content . Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . 
stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?'
}, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide a few example inputs and outputs to guide the model. This is often referred to as few-shot prompting. Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. \ No newline at end of file diff --git a/text_content/docs_text-generation_b6a5456c.txt b/text_content/docs_text-generation_b6a5456c.txt new file mode 100644 index 0000000000000000000000000000000000000000..acfa508083bbcc2457fbfc5bee378ddedf769fd3 --- /dev/null +++ b/text_content/docs_text-generation_b6a5456c.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/text-generation#text-input +Title: Text generation  |  Gemini API  |  Google AI for Developers +==================================================
Parts [ 0 ]. Text ) } } REST curl https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Streaming can also be used for multi-turn conversations. Python from google import genai client = genai . Client () chat = client . chats . create ( model = "gemini-2.5-flash" ) response = chat . send_message_stream ( "I have 2 dogs in my house." ) for chunk in response : print ( chunk . text , end = "" ) response = chat . send_message_stream ( "How many paws are in my house?" ) for chunk in response : print ( chunk . text , end = "" ) for message in chat . get_history (): print ( f 'role - { message . role } ' , end = ": " ) print ( message . parts [ 0 ] . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const chat = ai . chats . create ({ model : "gemini-2.5-flash" , history : [ { role : "user" , parts : [{ text : "Hello" }], }, { role : "model" , parts : [{ text : "Great to meet you. What would you like to know?" }], }, ], }); const stream1 = await chat . sendMessageStream ({ message : "I have 2 dogs in my house." , }); for await ( const chunk of stream1 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } const stream2 = await chat . sendMessageStream ({ message : "How many paws are in my house?" , }); for await ( const chunk of stream2 ) { console . log ( chunk . text ); console . log ( "_" . repeat ( 80 )); } } await main (); Go package main import ( "context" "fmt" "os" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } history := [] * genai . Content { genai . NewContentFromText ( "Hi nice to meet you! I have 2 dogs in my house." , genai . RoleUser ), genai . NewContentFromText ( "Great to meet you. What would you like to know?" , genai . RoleModel ), } chat , _ := client . Chats . Create ( ctx , "gemini-2.5-flash" , nil , history ) stream := chat . SendMessageStream ( ctx , genai . Part { Text : "How many paws are in my house?" }) for chunk , _ := range stream { part := chunk . Candidates [ 0 ]. Content . 
Parts [ 0 ] fmt . Print ( part . Text ) } } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent?alt=sse" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "role": "user", "parts": [ { "text": "Hello" } ] }, { "role": "model", "parts": [ { "text": "Great to meet you. What would you like to know?" } ] }, { "role": "user", "parts": [ { "text": "I have two dogs in my house. How many paws are in my house?" } ] } ] }' Apps Script // See https://developers.google.com/apps-script/guides/properties // for instructions on how to set the API key. const apiKey = PropertiesService . getScriptProperties (). getProperty ( 'GEMINI_API_KEY' ); function main () { const payload = { contents : [ { role : 'user' , parts : [ { text : 'Hello' }, ], }, { role : 'model' , parts : [ { text : 'Great to meet you. What would you like to know?' }, ], }, { role : 'user' , parts : [ { text : 'I have two dogs in my house. How many paws are in my house?' }, ], }, ], }; const url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:streamGenerateContent' ; const options = { method : 'POST' , contentType : 'application/json' , headers : { 'x-goog-api-key' : apiKey , }, payload : JSON . stringify ( payload ) }; const response = UrlFetchApp . fetch ( url , options ); const data = JSON . parse ( response ); const content = data [ 'candidates' ][ 0 ][ 'content' ][ 'parts' ][ 0 ][ 'text' ]; console . log ( content ); } Supported models All models in the Gemini family support text generation. To learn more about the models and their capabilities, visit the Models page. Best practices Prompting tips For basic text generation, a zero-shot prompt often suffices without needing examples, system instructions, or specific formatting. For more tailored outputs: Use System instructions to guide the model. Provide a few example inputs and outputs to guide the model. This is often referred to as few-shot prompting (a minimal sketch appears at the end of this page). Consult our prompt engineering guide for more tips. Structured output In some cases, you may need structured output, such as JSON. Refer to our structured output guide to learn how. What's next Try the Gemini API getting started Colab . Explore Gemini's image , video , audio and document understanding capabilities. Learn about multimodal file prompting strategies . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-06-27 UTC. 
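As a concrete illustration of the few-shot prompting tip above, here is a minimal sketch using the Python SDK shown throughout this page; the example reviews and labels are hypothetical placeholders:

Python

from google import genai

client = genai.Client()

# Few-shot prompting: include a couple of example input/output pairs in the
# prompt so the model infers the task and the expected output format.
prompt = """Classify each review as POSITIVE or NEGATIVE.

Review: The battery lasts all day. -> POSITIVE
Review: It broke after a week. -> NEGATIVE
Review: The screen is gorgeous. ->"""

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
)
print(response.text)  # Expected to print something like "POSITIVE"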
\ No newline at end of file diff --git a/text_content/docs_thinking_5dd41be4.txt b/text_content/docs_thinking_5dd41be4.txt new file mode 100644 index 0000000000000000000000000000000000000000..28d9aa55075fe3653fbaff44b1f1a0444ca53d0c --- /dev/null +++ b/text_content/docs_thinking_5dd41be4.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/thinking#tasks +Title: Gemini thinking  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini thinking | Gemini API | Google AI for Developers Gemini thinking The Gemini 2.5 series models use an internal "thinking process" that significantly improves their reasoning and multi-step planning abilities, making them highly effective for complex tasks such as coding, advanced mathematics, and data analysis. This guide shows you how to work with Gemini's thinking capabilities using the Gemini API. Before you begin Ensure you use a supported 2.5 series model for thinking. You might find it beneficial to explore these models in AI Studio before diving into the API: Try Gemini 2.5 Flash in AI Studio Try Gemini 2.5 Pro in AI Studio Try Gemini 2.5 Flash-Lite Preview in AI Studio Generating content with thinking Initiating a request with a thinking model is similar to any other content generation request. The key difference lies in specifying one of the models with thinking support in the model field, as demonstrated in the following text generation example: Python from google import genai client = genai . 
Client () prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const prompt = "Explain the concept of Occam's Razor and provide a simple, everyday example." ; const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : prompt , }); console . log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } prompt := "Explain the concept of Occam's Razor and provide a simple, everyday example." model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , genai . Text ( prompt ), nil ) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Explain the concept of Occam'\''s Razor and provide a simple, everyday example." } ] } ] }' Thinking budgets The thinkingBudget parameter guides the model on the number of thinking tokens to use when generating a response. A higher token count generally allows for more detailed reasoning, which can be beneficial for tackling more complex tasks. If latency is more important, use a lower budget or disable thinking by setting thinkingBudget to 0. Setting the thinkingBudget to -1 turns on dynamic thinking , meaning the model will adjust the budget based on the complexity of the request. The thinkingBudget is only supported in Gemini 2.5 Flash, 2.5 Pro, and 2.5 Flash-Lite. Depending on the prompt, the model might overflow or underflow the token budget. The following are the thinkingBudget configuration details for each model type. 2.5 Pro — default (budget not set): dynamic thinking, the model decides when and how much to think; range: 128 to 32768; disable thinking: not possible; turn on dynamic thinking: thinkingBudget = -1. 2.5 Flash — default (budget not set): dynamic thinking, the model decides when and how much to think; range: 0 to 24576; disable thinking: thinkingBudget = 0; turn on dynamic thinking: thinkingBudget = -1. 2.5 Flash-Lite — default (budget not set): the model does not think; range: 512 to 24576; disable thinking: thinkingBudget = 0; turn on dynamic thinking: thinkingBudget = -1. Python from google import genai from google.genai import types client = genai . Client () response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = "Provide a list of 3 famous physicists and their key contributions" , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( thinking_budget = 1024 ) # Turn off thinking: # thinking_config=types.ThinkingConfig(thinking_budget=0) # Turn on dynamic thinking: # thinking_config=types.ThinkingConfig(thinking_budget=-1) ), ) print ( response . text ) JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "Provide a list of 3 famous physicists and their key contributions" , config : { thinkingConfig : { thinkingBudget : 1024 , // Turn off thinking: // thinkingBudget: 0 // Turn on dynamic thinking: // thinkingBudget: -1 }, }, }); console . 
log ( response . text ); } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } thinkingBudgetVal := int32 ( 1024 ) contents := genai . Text ( "Provide a list of 3 famous physicists and their key contributions" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { ThinkingBudget : & thinkingBudgetVal , // Turn off thinking: // ThinkingBudget: int32(0), // Turn on dynamic thinking: // ThinkingBudget: int32(-1), }, }) fmt . Println ( resp . Text ()) } REST curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-pro:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [ { "parts": [ { "text": "Provide a list of 3 famous physicists and their key contributions" } ] } ], "generationConfig": { "thinkingConfig": { "thinkingBudget": 1024 } } }' To turn thinking off in the REST payload, set "thinkingBudget": 0 ; to turn on dynamic thinking, set "thinkingBudget": -1 . Thought summaries Thought summaries are synthesized versions of the model's raw thoughts and offer insights into the model's internal reasoning process. Note that thinking budgets apply to the model's raw thoughts and not to thought summaries. You can enable thought summaries by setting includeThoughts to true in your request configuration. You can then access the summary by iterating through the response's parts and checking the thought boolean. Here's an example demonstrating how to enable and retrieve thought summaries without streaming, which returns a single, final thought summary with the response: Python from google import genai from google.genai import types client = genai . Client () prompt = "What is the sum of the first 50 prime numbers?" response = client . models . generate_content ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ) for part in response . candidates [ 0 ] . content . parts : if not part . text : continue if part . thought : print ( "Thought summary:" ) print ( part . text ) print () else : print ( "Answer:" ) print ( part . text ) print () JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); async function main () { const response = await ai . models . generateContent ({ model : "gemini-2.5-pro" , contents : "What is the sum of the first 50 prime numbers?" , config : { thinkingConfig : { includeThoughts : true , }, }, }); for ( const part of response . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { console . log ( "Thoughts summary:" ); console . log ( part . text ); } else { console . log ( "Answer:" ); console . log ( part . text ); } } } main (); Go package main import ( "context" "fmt" "log" "google.golang.org/genai" ) func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( "What is the sum of the first 50 prime numbers?" ) model := "gemini-2.5-pro" resp , _ := client . Models . GenerateContent ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for _ , part := range resp . 
Candidates [ 0 ]. Content . Parts { if part . Text != "" { if part . Thought { fmt . Println ( "Thoughts Summary:" ) fmt . Println ( part . Text ) } else { fmt . Println ( "Answer:" ) fmt . Println ( part . Text ) } } } } And here is an example using thinking with streaming, which returns rolling, incremental summaries during generation: Python from google import genai from google.genai import types client = genai . Client () prompt = """ Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? """ thoughts = "" answer = "" for chunk in client . models . generate_content_stream ( model = "gemini-2.5-pro" , contents = prompt , config = types . GenerateContentConfig ( thinking_config = types . ThinkingConfig ( include_thoughts = True ) ) ): for part in chunk . candidates [ 0 ] . content . parts : if not part . text : continue elif part . thought : if not thoughts : print ( "Thoughts summary:" ) print ( part . text ) thoughts += part . text else : if not answer : print ( "Answer:" ) print ( part . text ) answer += part . text JavaScript import { GoogleGenAI } from "@google/genai" ; const ai = new GoogleGenAI ({}); const prompt = `Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own?` ; let thoughts = "" ; let answer = "" ; async function main () { const response = await ai . models . generateContentStream ({ model : "gemini-2.5-pro" , contents : prompt , config : { thinkingConfig : { includeThoughts : true , }, }, }); for await ( const chunk of response ) { for ( const part of chunk . candidates [ 0 ]. content . parts ) { if ( ! part . text ) { continue ; } else if ( part . thought ) { if ( ! thoughts ) { console . log ( "Thoughts summary:" ); } console . log ( part . text ); thoughts = thoughts + part . text ; } else { if ( ! answer ) { console . log ( "Answer:" ); } console . log ( part . text ); answer = answer + part . text ; } } } } await main (); Go package main import ( "context" "fmt" "log" "os" "google.golang.org/genai" ) const prompt = ` Alice, Bob, and Carol each live in a different house on the same street: red, green, and blue. The person who lives in the red house owns a cat. Bob does not live in the green house. Carol owns a dog. The green house is to the left of the red house. Alice does not own a cat. Who lives in each house, and what pet do they own? ` func main () { ctx := context . Background () client , err := genai . NewClient ( ctx , nil ) if err != nil { log . Fatal ( err ) } contents := genai . Text ( prompt ) model := "gemini-2.5-pro" resp := client . Models . GenerateContentStream ( ctx , model , contents , & genai . GenerateContentConfig { ThinkingConfig : & genai . ThinkingConfig { IncludeThoughts : true , }, }) for chunk := range resp { for _ , part := range chunk . Candidates [ 0 ]. Content . Parts { if len ( part . Text ) == 0 { continue } if part . Thought { fmt . Printf ( "Thought: %s\n" , part . Text ) } else { fmt . Printf ( "Answer: %s\n" , part . 
Text ) } } } } Thought signatures Because standard Gemini API text and content generation calls are stateless, when using thinking in multi-turn interactions (such as chat), the model doesn't have access to thought context from previous turns. You can maintain thought context using thought signatures, which are encrypted representations of the model's internal thought process. The model returns thought signatures in the response object when thinking and function calling are enabled. To ensure the model maintains context across multiple turns of a conversation, you must provide the thought signatures back to the model in the subsequent requests. You will receive thought signatures when: Thinking is enabled and thoughts are generated. The request includes function declarations . Note: Thought signatures are only available when you're using function calling; specifically, your request must include function declarations . You can find an example of thinking with function calls on the Function calling page. Other usage limitations to consider with function calling include: Signatures are returned from the model within other parts in the response, for example function calling or text parts. Return the entire response with all parts back to the model in subsequent turns. Don't concatenate parts with signatures together. Don't merge one part with a signature with another part without a signature. Pricing Note: Summaries are available in the free and paid tiers of the API. Thought signatures will increase the input tokens you are charged when sent back as part of the request. When thinking is turned on, response pricing is the sum of output tokens and thinking tokens. You can get the total number of generated thinking tokens from the thoughtsTokenCount field. Python # ... print ( "Thoughts tokens:" , response . usage_metadata . thoughts_token_count ) print ( "Output tokens:" , response . usage_metadata . candidates_token_count ) JavaScript // ... console . log ( `Thoughts tokens: ${ response . usageMetadata . thoughtsTokenCount } ` ); console . log ( `Output tokens: ${ response . usageMetadata . candidatesTokenCount } ` ); Go // ... fmt . Println ( "Thoughts tokens:" , response . UsageMetadata . ThoughtsTokenCount ) fmt . Println ( "Output tokens:" , response . UsageMetadata . CandidatesTokenCount ) Thinking models generate full thoughts to improve the quality of the final response, and then output summaries to provide insight into the thought process. So, pricing is based on the full thought tokens the model needs to generate to create a summary, despite only the summary being output from the API. You can learn more about tokens in the Token counting guide. Supported models Thinking features are supported on all the 2.5 series models. You can find all model capabilities on the model overview page. Best practices This section includes some guidance for using thinking models efficiently. As always, following our prompting guidance and best practices will get you the best results. Debugging and steering Review reasoning : When you're not getting your expected response from the thinking models, it can help to carefully analyze Gemini's thought summaries. You can see how it broke down the task and arrived at its conclusion, and use that information to correct course toward the right results. 
Provide Guidance in Reasoning : If you're hoping for a particularly lengthy output, you may want to provide guidance in your prompt to constrain the amount of thinking the model uses. This lets you reserve more of the token output for your response. Task complexity Easy Tasks (Thinking could be OFF): For straightforward requests where complex reasoning isn't required, such as fact retrieval or classification, thinking is not required. Examples include: "Where was DeepMind founded?" "Is this email asking for a meeting or just providing information?" Medium Tasks (Default/Some Thinking): Many common requests benefit from a degree of step-by-step processing or deeper understanding. Gemini can flexibly use thinking capability for tasks like: Analogize photosynthesis and growing up. Compare and contrast electric cars and hybrid cars. Hard Tasks (Maximum Thinking Capability): For truly complex challenges, such as solving complex math problems or coding tasks, we recommend setting a high thinking budget. These types of tasks require the model to engage its full reasoning and planning capabilities, often involving many internal steps before providing an answer. Examples include: Solve problem 1 in AIME 2025: Find the sum of all integer bases b > 9 for which 17 b is a divisor of 97 b . Write Python code for a web application that visualizes real-time stock market data, including user authentication. Make it as efficient as possible. Thinking with tools and capabilities Thinking models work with all of Gemini's tools and capabilities. This allows the models to interact with external systems, execute code, or access real-time information, incorporating the results into their reasoning and final response. The search tool allows the model to query Google Search to find up-to-date information or information beyond its training data. This is useful for questions about recent events or highly specific topics. The code execution tool enables the model to generate and run Python code to perform calculations, manipulate data, or solve problems that are best handled algorithmically. The model receives the code's output and can use it in its response. With structured output , you can constrain Gemini to respond with JSON. This is particularly useful for integrating the model's output into applications. Function calling connects the thinking model to external tools and APIs, so it can reason about when to call the right function and what parameters to provide. URL Context provides the model with URLs as additional context for your prompt. The model can then retrieve content from the URLs and use that content to inform and shape its response. You can try examples of using tools with thinking models in the Thinking cookbook . What's next? To work through more in depth examples, like: Using tools with thinking Streaming with thinking Adjusting the thinking budget for different results and more, try our Thinking cookbook . Thinking coverage is now available in our OpenAI Compatibility guide. For more info about Gemini 2.5 Pro, Gemini Flash 2.5, and Gemini 2.5 Flash-Lite, visit the model page . Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-11 UTC. 
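Tying together the task-complexity guidance and the thinkingBudget settings described on this page, here is a minimal Python sketch; the difficulty-to-budget mapping is an illustrative assumption, not an official recommendation:

Python

from google import genai
from google.genai import types

client = genai.Client()

# Illustrative mapping of the task tiers above to budgets on 2.5 Flash:
# easy -> thinking off (0), medium -> dynamic (-1), hard -> large fixed budget.
BUDGETS = {"easy": 0, "medium": -1, "hard": 8192}

def ask(prompt, difficulty="medium"):
    response = client.models.generate_content(
        model="gemini-2.5-flash",  # 2.5 Flash supports both 0 and -1
        contents=prompt,
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(
                thinking_budget=BUDGETS[difficulty]
            )
        ),
    )
    # Thinking tokens are billed in addition to output tokens (see Pricing);
    # thoughts_token_count may be None when thinking is off.
    print("Thoughts tokens:", response.usage_metadata.thoughts_token_count)
    print("Output tokens:", response.usage_metadata.candidates_token_count)
    return response.text

print(ask("Where was DeepMind founded?", difficulty="easy"))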
\ No newline at end of file diff --git a/text_content/docs_usage-policies_64976133.txt b/text_content/docs_usage-policies_64976133.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3cd5b08e6f45231115385249ee4859d22503ad3 --- /dev/null +++ b/text_content/docs_usage-policies_64976133.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/usage-policies#main-content +Title: Additional usage policies  |  Gemini API  |  Google AI for Developers +================================================== + +Additional usage policies | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Additional usage policies This page includes additional usage policies for the Gemini API. Abuse monitoring Google is committed to the responsible development and use of AI. To ensure the safety and integrity of the Gemini API, we have created these policy guidelines. By using the Gemini API, you agree to the following guidelines, the Gemini API Additional Terms of Service and Generative AI Prohibited Use Policy . How We Monitor for Misuse Google's Trust and Safety Team employs a combination of automated and manual processes to detect potential misuse of the Gemini API and enforce our policies. Automated Detection: Automated systems scan API usage for violations of our Prohibited Use Policy, such as hate speech, harassment, sexually explicit content, and dangerous content. Manual Detection: If a project consistently exhibits suspicious activity, it may be flagged for manual review by authorized Google personnel. How We Handle Data To help with abuse monitoring, Google retains the following data for fifty-five (55) days: Prompts: The text prompts you submit to the API. Contextual Information: Any additional context you provide with your prompts. Output: The responses generated by the Gemini API. How We Investigate Potential Issues When prompts or model outputs are flagged by safety filters and abuse detection systems described above, authorized Google employees may assess the flagged content, and either confirm or correct the classification or determination based on predefined guidelines and policies. Data can be accessed for human review only by authorized Google employees via an internal governance assessment and review management platform. When data is logged for abuse monitoring, it is used solely for the purpose of policy enforcement and is not used to train or fine-tune any AI/ML models. Working with You on Policy Compliance If your use of Gemini doesn't align with our policies, we may take the following steps: Get in touch: We may reach out to you through email to understand your use case and explore ways to bring your usage into compliance. Temporary usage limits: We may limit your access to the Gemini API. Temporary suspension: We may temporarily pause your access to the Gemini API. Account closure: As a last resort, and for serious violations, we may permanently close your access to the Gemini API and other Google services. Scope These policy guidelines apply to the use of the Gemini API and AI Studio. Inline Preference Voting In Google AI Studio, you might occasionally see a side-by-side comparison of two different responses to your prompt. 
This is part of our Inline Preference Voting system. You'll be asked to choose which response you prefer. This helps us understand which model outputs users find most helpful.

Why are we doing this?
We're constantly working to improve our AI models and services. Your feedback through Inline Preference Voting helps us provide, improve, and develop Google products and services and machine learning technologies, including Google's enterprise features, products and services, consistent with the Gemini API Additional Terms of Service and Privacy Policy .

What data is included in Feedback?
To make informed decisions about our models, we collect certain data when you participate in Inline Preference Voting: Prompts and Responses: We record all prompts and responses, including any uploaded content, in the conversation you submitted feedback about. We also record the two response options that you selected from. This helps us understand the context of your preference. Your Vote: We record which response you preferred. This is the core of the feedback we're collecting. Usage Details: This includes information about which model generated the response and other technical and operational details about your usage of this feature.

Your Privacy
We take your privacy seriously. Google takes steps to protect your privacy as part of this process. This includes disconnecting this data from your Google Account, API key, and Cloud project before reviewers see or annotate it. Do not submit feedback on conversations that include sensitive, confidential, or personal information.

Opting Out
You'll have the option to skip the Inline Preference Voting when it appears. Thank you for helping us improve Google AI Studio!

\ No newline at end of file diff --git a/text_content/docs_video-understanding_3ec8ac8d.txt b/text_content/docs_video-understanding_3ec8ac8d.txt new file mode 100644 index 0000000000000000000000000000000000000000..99f57b7af8ef215a5e1b24a4b4761c0fac5f9c2c --- /dev/null +++ b/text_content/docs_video-understanding_3ec8ac8d.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/video-understanding#upload-video +Title: Video understanding | Gemini API | Google AI for Developers +================================================== + +Video understanding | Gemini API | Google AI for Developers

Video understanding
Gemini models can process videos, enabling many frontier developer use cases that would have historically required domain specific models. Some of Gemini's vision capabilities include the ability to: Describe, segment, and extract information from videos Answer questions about video content Refer to specific timestamps within a video Gemini was built to be multimodal from the ground up and we continue to push the frontier of what is possible.
This guide shows how to use the Gemini API to generate text responses based on video inputs.

Video input
You can provide videos as input to Gemini in the following ways: Upload a video file using the File API before making a request to generateContent . Use this method for files larger than 20MB, videos longer than approximately 1 minute, or when you want to reuse the file across multiple requests. Pass inline video data with the request to generateContent . Use this method for smaller files (<20MB) and shorter durations. Include a YouTube URL directly in the prompt.

Upload a video file
You can use the Files API to upload a video file. Always use the Files API when the total request size (including the file, text prompt, system instructions, etc.) is larger than 20 MB, the video duration is significant, or if you intend to use the same video in multiple prompts. The File API accepts video file formats directly. This example uses the short NASA film "Jupiter's Great Red Spot Shrinks and Grows" . Credit: Goddard Space Flight Center (GSFC)/David Ladd (2018). "Jupiter's Great Red Spot Shrinks and Grows" is in the public domain and does not show identifiable people. ( NASA image and media usage guidelines. )

The following code uploads a video file using the File API and then uses the file reference in a call to generateContent . (A sketch of waiting for the uploaded file to finish processing follows the examples.)

Python
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp4")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[myfile, "Summarize this video. Then create a quiz with an answer key based on the information in this video."]
)
print(response.text)

JavaScript
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.mp4",
    config: { mimeType: "video/mp4" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.0-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Summarize this video. Then create a quiz with an answer key based on the information in this video.",
    ]),
  });
  console.log(response.text);
}

await main();

Go
uploadedFile, _ := client.Files.UploadFromPath(ctx, "path/to/sample.mp4", nil)

parts := []*genai.Part{
    genai.NewPartFromText("Summarize this video. Then create a quiz with an answer key based on the information in this video."),
    genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
}
contents := []*genai.Content{
    genai.NewContentFromParts(parts, genai.RoleUser),
}

result, _ := client.Models.GenerateContent(ctx, "gemini-2.0-flash", contents, nil)
fmt.Println(result.Text())

REST
VIDEO_PATH="path/to/sample.mp4"
MIME_TYPE=$(file -b --mime-type "${VIDEO_PATH}")
NUM_BYTES=$(wc -c < "${VIDEO_PATH}")
DISPLAY_NAME=VIDEO
tmp_header_file=upload-header.tmp

echo "Starting file upload..."
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D ${tmp_header_file} \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2>/dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d " " -f2 | tr -d "\r")
rm "${tmp_header_file}"

echo "Uploading video data..."
curl "${upload_url}" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${VIDEO_PATH}" 2>/dev/null > file_info.json

file_uri=$(jq -r ".file.uri" file_info.json)
echo file_uri=$file_uri
echo "File uploaded successfully. File URI: ${file_uri}"

# --- 3. Generate content using the uploaded video file ---
echo "Generating content from video..."
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": "'"${file_uri}"'"}},
        {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}]
    }]
  }' 2>/dev/null > response.json

jq -r ".candidates[].content.parts[].text" response.json

To learn more about working with media files, see Files API .
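Larger videos are processed asynchronously after upload, so the file may not be usable immediately. Here is a minimal sketch of waiting for processing to finish, assuming the file object returned by the Files API exposes a state field (see the Files API guide):

Python
import time
from google import genai

client = genai.Client()

myfile = client.files.upload(file="path/to/sample.mp4")

# Poll until processing finishes. Assumes the Files API surfaces a
# `state` field on the file object, as described in the Files API guide.
while myfile.state.name == "PROCESSING":
    time.sleep(5)
    myfile = client.files.get(name=myfile.name)

if myfile.state.name == "FAILED":
    raise RuntimeError("Video processing failed")

# myfile can now be passed to generate_content as in the examples above.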
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -D ${ tmp_header_file } \ -H "X-Goog-Upload-Protocol: resumable" \ -H "X-Goog-Upload-Command: start" \ -H "X-Goog-Upload-Header-Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Header-Content-Type: ${ MIME_TYPE } " \ -H "Content-Type: application/json" \ -d "{'file': {'display_name': ' ${ DISPLAY_NAME } '}}" 2 > /dev/null upload_url = $( grep -i "x-goog-upload-url: " " ${ tmp_header_file } " | cut -d " " -f2 | tr -d "\r" ) rm " ${ tmp_header_file } " echo "Uploading video data..." curl " ${ upload_url } " \ -H "Content-Length: ${ NUM_BYTES } " \ -H "X-Goog-Upload-Offset: 0" \ -H "X-Goog-Upload-Command: upload, finalize" \ --data-binary "@ ${ VIDEO_PATH } " 2 > /dev/null > file_info.json file_uri = $( jq -r ".file.uri" file_info.json ) echo file_uri = $file_uri echo "File uploaded successfully. File URI: ${ file_uri } " # --- 3. Generate content using the uploaded video file --- echo "Generating content from video..." curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \ -H "x-goog-api-key: $GEMINI_API_KEY " \ -H 'Content-Type: application/json' \ -X POST \ -d '{ "contents": [{ "parts":[ {"file_data":{"mime_type": "' " ${ MIME_TYPE } " '", "file_uri": "' " ${ file_uri } " '"}}, {"text": "Summarize this video. Then create a quiz with an answer key based on the information in this video."}] }] }' 2 > /dev/null > response.json jq -r ".candidates[].content.parts[].text" response.json To learn more about working with media files, see Files API . Pass video data inline Instead of uploading a video file using the File API, you can pass smaller videos directly in the request to generateContent . This is suitable for shorter videos under 20MB total request size. Here's an example of providing inline video data: Python # Only for videos of size <20Mb video_file_name = "/path/to/your/video.mp4" video_bytes = open ( video_file_name , 'rb' ) . read () response = client . models . generate_content ( model = 'models/gemini-2.0-flash' , contents = types . Content ( parts = [ types . Part ( inline_data = types . Blob ( data = video_bytes , mime_type = 'video/mp4' ) ), types . Part ( text = 'Please summarize the video in 3 sentences.' ) ] ) ) JavaScript import { GoogleGenAI } from "@google/genai" ; import * as fs from "node:fs" ; const ai = new GoogleGenAI ({}); const base64VideoFile = fs . readFileSync ( "path/to/small-sample.mp4" , { encoding : "base64" , }); const contents = [ { inlineData : { mimeType : "video/mp4" , data : base64VideoFile , }, }, { text : "Please summarize the video in 3 sentences." } ]; const response = await ai . models . generateContent ({ model : "gemini-2.0-flash" , contents : contents , }); console . log ( response . text ); REST Note: If you get an Argument list too long error, the base64 encoding of your file might be too long for the curl command line. Use the File API method instead for larger files. 
Include a YouTube URL
Preview: The YouTube URL feature is in preview and is available at no charge. Pricing and rate limits are likely to change.
The Gemini API and AI Studio support YouTube URLs as a file data Part . You can include a YouTube URL with a prompt asking the model to summarize, translate, or otherwise interact with the video content.
Limitations: For the free tier, you can't upload more than 8 hours of YouTube video per day. For the paid tier, there is no limit based on video length. For models earlier than the 2.5 series, you can upload only 1 video per request. For 2.5-series and later models, you can upload a maximum of 10 videos per request (see the sketch after the examples). You can only upload public videos (not private or unlisted videos).
The following example shows how to include a YouTube URL with a prompt:

Python
response = client.models.generate_content(
    model='models/gemini-2.0-flash',
    contents=types.Content(
        parts=[
            types.Part(
                file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=9hE5-98ZeCg')
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)

JavaScript
import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({});

const response = await ai.models.generateContent({
  model: "gemini-2.0-flash",
  contents: [
    { text: "Please summarize the video in 3 sentences." },
    { fileData: { fileUri: "https://www.youtube.com/watch?v=9hE5-98ZeCg" } },
  ],
});
console.log(response.text);

Go
package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    parts := []*genai.Part{
        genai.NewPartFromText("Please summarize the video in 3 sentences."),
        genai.NewPartFromURI("https://www.youtube.com/watch?v=9hE5-98ZeCg", "video/mp4"),
    }
    contents := []*genai.Content{
        genai.NewContentFromParts(parts, genai.RoleUser),
    }

    result, _ := client.Models.GenerateContent(ctx, "gemini-2.0-flash", contents, nil)
    fmt.Println(result.Text())
}

REST
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"text": "Please summarize the video in 3 sentences."},
        {
          "file_data": {
            "file_uri": "https://www.youtube.com/watch?v=9hE5-98ZeCg"
          }
        }
      ]
    }]
  }' 2>/dev/null
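As a minimal sketch of the multi-video limit noted above, here is how passing two YouTube URLs in a single request to a 2.5-series model might look; the video IDs are placeholders:

Python
from google import genai
from google.genai import types

client = genai.Client()

# 2.5-series and later models accept up to 10 videos per request.
# The video IDs below are placeholders.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=types.Content(
        parts=[
            types.Part(file_data=types.FileData(
                file_uri="https://www.youtube.com/watch?v=VIDEO_ID_1")),
            types.Part(file_data=types.FileData(
                file_uri="https://www.youtube.com/watch?v=VIDEO_ID_2")),
            types.Part(text="Compare the main arguments made in these two videos."),
        ]
    ),
)
print(response.text)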
Refer to timestamps in the content
You can ask questions about specific points in time within the video using timestamps of the form MM:SS .

Python
prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?"  # Adjusted timestamps for the NASA video

JavaScript
const prompt = "What are the examples given at 00:05 and 00:10 supposed to show us?";

Go
prompt := []*genai.Part{
    genai.NewPartFromURI(currentVideoFile.URI, currentVideoFile.MIMEType),
    // Adjusted timestamps for the NASA video
    genai.NewPartFromText("What are the examples given at 00:05 and " +
        "00:10 supposed to show us?"),
}

REST
PROMPT="What are the examples given at 00:05 and 00:10 supposed to show us?"

Transcribe video and provide visual descriptions
The Gemini models can transcribe and provide visual descriptions of video content by processing both the audio track and visual frames. For visual descriptions, the model samples the video at a rate of 1 frame per second . This sampling rate may affect the level of detail in the descriptions, particularly for videos with rapidly changing visuals.

Python
prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions."

JavaScript
const prompt = "Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions.";

Go
prompt := []*genai.Part{
    genai.NewPartFromURI(currentVideoFile.URI, currentVideoFile.MIMEType),
    genai.NewPartFromText("Transcribe the audio from this video, giving timestamps for salient events in the video. Also " +
        "provide visual descriptions."),
}

REST
PROMPT="Transcribe the audio from this video, giving timestamps for salient events in the video. Also provide visual descriptions."

Customize video processing
You can customize video processing in the Gemini API by setting clipping intervals or providing custom frame rate sampling.
Tip: Video clipping and frames per second (FPS) are supported by all models, but quality is significantly higher with 2.5 series models.

Set clipping intervals
You can clip video by specifying videoMetadata with start and end offsets.

Python
response = client.models.generate_content(
    model='models/gemini-2.5-flash-preview-05-20',
    contents=types.Content(
        parts=[
            types.Part(
                file_data=types.FileData(file_uri='https://www.youtube.com/watch?v=XEzRZ35urlk'),
                video_metadata=types.VideoMetadata(
                    start_offset='1250s',
                    end_offset='1570s'
                )
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)

Set a custom frame rate
You can set custom frame rate sampling by passing an fps argument to videoMetadata .

Python
# Only for videos of size <20Mb
video_file_name = "/path/to/your/video.mp4"
video_bytes = open(video_file_name, 'rb').read()

response = client.models.generate_content(
    model='models/gemini-2.5-flash-preview-05-20',
    contents=types.Content(
        parts=[
            types.Part(
                inline_data=types.Blob(data=video_bytes, mime_type='video/mp4'),
                video_metadata=types.VideoMetadata(fps=5)
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
)

By default, 1 frame per second (FPS) is sampled from the video. You might want to set a low FPS (< 1) for long videos. This is especially useful for mostly static videos (e.g. lectures). If you want to capture more details in rapidly changing visuals, consider setting a higher FPS value. (A low-FPS sketch follows the format list below.)

Supported video formats
Gemini supports the following video format MIME types: video/mp4 video/mpeg video/mov video/avi video/x-flv video/mpg video/webm video/wmv video/3gpp
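As a sketch of the low-FPS guidance above: attaching a custom fps to a file data part mirrors the way start and end offsets are attached in the clipping example, but treat that combination, and the placeholder lecture URL, as assumptions to verify:

Python
from google import genai
from google.genai import types

client = genai.Client()

# Mostly static lecture video: sample one frame every two seconds.
# Combining fps with a file data part is extrapolated from the clipping
# example above; the URL is a placeholder.
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=types.Content(
        parts=[
            types.Part(
                file_data=types.FileData(
                    file_uri="https://www.youtube.com/watch?v=LECTURE_VIDEO_ID"),
                video_metadata=types.VideoMetadata(fps=0.5),
            ),
            types.Part(text="Outline the main topics covered in this lecture."),
        ]
    ),
)
print(response.text)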
Technical details about videos
Supported models & context : All Gemini 2.0 and 2.5 models can process video data. Models with a 2M context window can process videos up to 2 hours long at default media resolution or 6 hours long at low media resolution, while models with a 1M context window can process videos up to 1 hour long at default media resolution or 3 hours long at low media resolution.
File API processing : When using the File API, videos are sampled at 1 frame per second (FPS) and audio is processed at 1 Kbps (single channel). Timestamps are added every second. These rates are subject to change in the future for improvements in inference.
Token calculation : Each second of video is tokenized as follows: Individual frames (sampled at 1 FPS): If mediaResolution is set to low, frames are tokenized at 66 tokens per frame. Otherwise, frames are tokenized at 258 tokens per frame. Audio: 32 tokens per second. Metadata is also included. Total: Approximately 300 tokens per second of video at default media resolution, or 100 tokens per second of video at low media resolution. For example, a 10-minute (600-second) video costs roughly 600 × 300 = 180,000 tokens at default media resolution, or 600 × 100 = 60,000 tokens at low media resolution.
Timestamp format : When referring to specific moments in a video within your prompt, use the MM:SS format (e.g., 01:15 for 1 minute and 15 seconds).
Best practices : Use only one video per prompt request for optimal results. If combining text and a single video, place the text prompt after the video part in the contents array. Be aware that fast action sequences might lose detail due to the 1 FPS sampling rate. Consider slowing down such clips if necessary.

What's next
This guide shows how to upload video files and generate text outputs from video inputs. To learn more, see the following resources: System instructions : System instructions let you steer the behavior of the model based on your specific needs and use cases. Files API : Learn more about uploading and managing files for use with Gemini. File prompting strategies : The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting. Safety guidance : Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs.

\ No newline at end of file diff --git a/text_content/docs_video_b1ba8846.txt b/text_content/docs_video_b1ba8846.txt new file mode 100644 index 0000000000000000000000000000000000000000..73a08ec30dd33b6506f2dda5f7bdef4e2d7cbf24 --- /dev/null +++ b/text_content/docs_video_b1ba8846.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/video#prompt-guide +Title: Generate video using Veo | Gemini API | Google AI for Developers +================================================== + +Generate video using Veo | Gemini API | Google AI for Developers

Generate video using Veo
The Gemini API provides access to Veo 2 , Google's most capable video generation model to date.
Veo generates videos in a wide range of cinematic and visual styles, capturing prompt nuance to render intricate details consistently across frames. This guide will help you get started with Veo using the Gemini API. For video prompting guidance, check out the Veo prompt guide section.
Note: Veo is a paid feature and will not run in the Free tier. Visit the Pricing page for more details.

Before you begin
Before calling the Gemini API, ensure you have your SDK of choice installed, and a Gemini API key configured and ready to use. To use Veo with the Google Gen AI SDKs, ensure that you have one of the following versions installed: Python v1.10.0 or later TypeScript and JavaScript v0.8.0 or later Go v1.0.0 or later

Generate videos
This section provides code examples for generating videos using text prompts and using images .

Generate from text
You can use the following code to generate videos with Veo:

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="Panning wide shot of a calico kitten sleeping in the sunshine",
    config=types.GenerateVideosConfig(
        person_generation="dont_allow",  # "dont_allow" or "allow_adult"
        aspect_ratio="16:9",  # "16:9" or "9:16"
    ),
)

while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)

for n, generated_video in enumerate(operation.response.generated_videos):
    client.files.download(file=generated_video.video)
    generated_video.video.save(f"video{n}.mp4")  # save the video

JavaScript
import { GoogleGenAI } from "@google/genai";
import { createWriteStream } from "fs";
import { Readable } from "stream";

const ai = new GoogleGenAI({});

async function main() {
  let operation = await ai.models.generateVideos({
    model: "veo-2.0-generate-001",
    prompt: "Panning wide shot of a calico kitten sleeping in the sunshine",
    config: {
      personGeneration: "dont_allow",
      aspectRatio: "16:9",
    },
  });

  while (!operation.done) {
    await new Promise((resolve) => setTimeout(resolve, 10000));
    operation = await ai.operations.getVideosOperation({
      operation: operation,
    });
  }

  operation.response?.generatedVideos?.forEach(async (generatedVideo, n) => {
    const resp = await fetch(`${generatedVideo.video?.uri}&key=GEMINI_API_KEY`); // append your API key
    const writer = createWriteStream(`video${n}.mp4`);
    Readable.fromWeb(resp.body).pipe(writer);
  });
}

main();

Go
package main

import (
    "context"
    "fmt"
    "log"
    "os"
    "time"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    videoConfig := &genai.GenerateVideosConfig{
        AspectRatio:      "16:9",
        PersonGeneration: "dont_allow",
    }

    operation, _ := client.Models.GenerateVideos(
        ctx,
        "veo-2.0-generate-001",
        "Panning wide shot of a calico kitten sleeping in the sunshine",
        nil,
        videoConfig,
    )

    for !operation.Done {
        time.Sleep(20 * time.Second)
        operation, _ = client.Operations.GetVideosOperation(ctx, operation, nil)
    }

    for n, video := range operation.Response.GeneratedVideos {
        client.Files.Download(ctx, video.Video, nil)
        fname := fmt.Sprintf("video_%d.mp4", n)
        _ = os.WriteFile(fname, video.Video.VideoBytes, 0644)
    }
}

REST
# Use curl to send a POST request to the predictLongRunning endpoint.
# The request body includes the prompt for video generation.
curl "${BASE_URL}/models/veo-2.0-generate-001:predictLongRunning" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H "Content-Type: application/json" \
  -X "POST" \
  -d '{
    "instances": [{
        "prompt": "Panning wide shot of a calico kitten sleeping in the sunshine"
      }
    ],
    "parameters": {
      "aspectRatio": "16:9",
      "personGeneration": "dont_allow"
    }
  }' | tee result.json | jq .name | sed 's/"//g' > op_name

# Obtain operation name to download video.
op_name=$(cat op_name)

# Check against status of operation.
while true; do
  is_done=$(curl -H "x-goog-api-key: $GEMINI_API_KEY" "${BASE_URL}/${op_name}" | tee op_check.json | jq .done)

  if [ "${is_done}" = "true" ]; then
    cat op_check.json
    echo "** Attach API_KEY to download video, or examine error message."
    break
  fi

  echo "** Video ${op_name} has not downloaded yet! Check again after 5 seconds..."
  # Wait for 5 seconds to check again.
  sleep 5
done

This code takes about 2-3 minutes to run, though it may take longer if resources are constrained. Once it's done running, you should see a video that looks something like this: If you see an error message instead of a video, this means that resources are constrained and your request couldn't be completed. In this case, run the code again.
Generated videos are stored on the server for 2 days, after which they are removed. If you want to save a local copy of your generated video, you must run result() and save() within 2 days of generation.

Generate from images
You can also generate videos using images. The following code generates an image using Imagen, then uses the generated image as the starting frame for the generated video.
First, generate an image using Imagen :

Python
prompt = "Panning wide shot of a calico kitten sleeping in the sunshine"

imagen = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt=prompt,
    config=types.GenerateImagesConfig(
        aspect_ratio="16:9",
        number_of_images=1
    )
)

imagen.generated_images[0].image

JavaScript
import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({});

const response = await ai.models.generateImages({
  model: "imagen-3.0-generate-002",
  prompt: "Panning wide shot of a calico kitten sleeping in the sunshine",
  config: {
    numberOfImages: 1,
  },
});

// you'll pass response.generatedImages[0].image.imageBytes to Veo

Go
package main

import (
    "context"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    config := &genai.GenerateImagesConfig{
        AspectRatio:    "16:9",
        NumberOfImages: 1,
    }

    response, _ := client.Models.GenerateImages(
        ctx,
        "imagen-3.0-generate-002",
        "Panning wide shot of a calico kitten sleeping in the sunshine",
        config,
    )

    // you'll pass response.GeneratedImages[0].Image to Veo
    _ = response.GeneratedImages[0].Image
}

Then, generate a video using the resulting image as the first frame:

Python
operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt=prompt,
    image=imagen.generated_images[0].image,
    config=types.GenerateVideosConfig(
        person_generation="dont_allow",  # "dont_allow" or "allow_adult"
        aspect_ratio="16:9",  # "16:9" or "9:16"
        number_of_videos=2
    ),
)

# Wait for videos to generate
while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)
for n, video in enumerate(operation.response.generated_videos):
    fname = f'with_image_input{n}.mp4'
    print(fname)
    client.files.download(file=video.video)
    video.video.save(fname)

JavaScript
import { GoogleGenAI } from "@google/genai";
import { createWriteStream } from "fs";
import { Readable } from "stream";

const ai = new GoogleGenAI({});

async function main() {
  // get image bytes from Imagen, as shown above

  let operation = await ai.models.generateVideos({
    model: "veo-2.0-generate-001",
    prompt: "Panning wide shot of a calico kitten sleeping in the sunshine",
    image: {
      imageBytes: response.generatedImages[0].image.imageBytes, // response from Imagen
      mimeType: "image/png",
    },
    config: {
      aspectRatio: "16:9",
      numberOfVideos: 2,
    },
  });

  while (!operation.done) {
    await new Promise((resolve) => setTimeout(resolve, 10000));
    operation = await ai.operations.getVideosOperation({
      operation: operation,
    });
  }

  operation.response?.generatedVideos?.forEach(async (generatedVideo, n) => {
    const resp = await fetch(
      `${generatedVideo.video?.uri}&key=GEMINI_API_KEY`, // append your API key
    );
    const writer = createWriteStream(`video${n}.mp4`);
    Readable.fromWeb(resp.body).pipe(writer);
  });
}

main();

Go
image := response.GeneratedImages[0].Image

videoConfig := &genai.GenerateVideosConfig{
    AspectRatio:    "16:9",
    NumberOfVideos: 2,
}

operation, _ := client.Models.GenerateVideos(
    ctx,
    "veo-2.0-generate-001",
    "A dramatic scene based on the input image",
    image,
    videoConfig,
)

for !operation.Done {
    time.Sleep(20 * time.Second)
    operation, _ = client.Operations.GetVideosOperation(ctx, operation, nil)
}

for n, video := range operation.Response.GeneratedVideos {
    client.Files.Download(ctx, video.Video, nil)
    fname := fmt.Sprintf("video_with_image_input_%d.mp4", n)
    _ = os.WriteFile(fname, video.Video.VideoBytes, 0644)
}

Veo model parameters
(Naming conventions vary by programming language.)
prompt : The text prompt for the video. When present, the image parameter is optional.
image : The image to use as the first frame for the video. When present, the prompt parameter is optional.
negativePrompt : Text string that describes anything you want to discourage the model from generating.
aspectRatio : Changes the aspect ratio of the generated video. Supported values are "16:9" and "9:16" . The default is "16:9" .
personGeneration : Allow the model to generate videos of people. The following values are supported: Text-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. "allow_all" : Generate videos that include adults and children. Image-to-video generation: "dont_allow" : Don't allow the inclusion of people or faces. "allow_adult" : Generate videos that include adults, but not children. See Limitations .
numberOfVideos : Output videos requested, either 1 or 2 .
durationSeconds : Length of each output video in seconds, between 5 and 8 .
enhance_prompt : Enable or disable the prompt rewriter. Enabled by default.
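As a minimal sketch of combining these parameters in a Python request (the snake_case field names follow the Python examples above and the naming-convention note; treat any field you haven't verified against your SDK version as an assumption):

Python
import time
from google import genai
from google.genai import types

client = genai.Client()

operation = client.models.generate_videos(
    model="veo-2.0-generate-001",
    prompt="A tranquil koi pond at dawn, gentle ripples, soft morning light",
    config=types.GenerateVideosConfig(
        negative_prompt="people, buildings",  # describe what to avoid, not "no people"
        aspect_ratio="9:16",
        person_generation="dont_allow",
        number_of_videos=1,
        duration_seconds=8,
        enhance_prompt=True,  # prompt rewriter is enabled by default
    ),
)

while not operation.done:
    time.sleep(20)
    operation = client.operations.get(operation)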
Specifications
Modalities: Text-to-video generation, Image-to-video generation
Request latency: Min: 11 seconds; Max: 6 minutes (during peak hours)
Variable length generation: 5-8 seconds
Resolution: 720p
Frame rate: 24fps
Aspect ratio: 16:9 (landscape), 9:16 (portrait)
Input languages (text-to-video): English

Limitations
Image-to-video personGeneration is not allowed in EU, UK, CH, MENA locations. Text-to-video personGeneration: "allow_all" is not allowed in EU, UK, CH, MENA locations.
Note: Check out the Models , Pricing , and Rate limits pages for more usage limitations for Veo.
Videos created by Veo are watermarked using SynthID , our tool for watermarking and identifying AI-generated content, and are passed through safety filters and memorization checking processes that help mitigate privacy, copyright and bias risks.

Things to try
To get the most out of Veo, incorporate video-specific terminology into your prompts. Veo understands a wide range of terms related to:
Shot composition: Specify the framing and number of subjects in the shot (e.g., "single shot," "two shot," "over-the-shoulder shot").
Camera positioning and movement: Control the camera's location and movement using terms like "eye level," "high angle," "worm's eye," "dolly shot," "zoom shot," "pan shot," and "tracking shot."
Focus and lens effects: Use terms like "shallow focus," "deep focus," "soft focus," "macro lens," and "wide-angle lens" to achieve specific visual effects.
Overall style and subject: Guide Veo's creative direction by specifying styles like "sci-fi," "romantic comedy," "action movie," or "animation." You can also describe the subjects and backgrounds you want, such as "cityscape," "nature," "vehicles," or "animals."

Veo prompt guide
This section of the Veo guide contains examples of videos you can create using Veo, and shows you how to modify prompts to produce distinct results.

Safety filters
Veo applies safety filters across Gemini to help ensure that generated videos and uploaded photos don't contain offensive content. Prompts that violate our terms and guidelines are blocked.

Prompt writing basics
Good prompts are descriptive and clear. To get your generated video as close as possible to what you want, start with identifying your core idea, and then refine your idea by adding keywords and modifiers. The following elements should be included in your prompt:
Subject : The object, person, animal, or scenery that you want in your video.
Context : The background or context in which the subject is placed.
Action : What the subject is doing (for example, walking , running , or turning their head ).
Style : This can be general or very specific. Consider using specific film style keywords, such as horror film , film noir , or animated styles like cartoon style.
Camera motion : [Optional] What the camera is doing, such as aerial view , eye-level , top-down shot , or low-angle shot .
Composition : [Optional] How the shot is framed, such as wide shot , close-up , or extreme close-up .
Ambiance : [Optional] How the color and light contribute to the scene, such as blue tones , night , or warm tones .

More tips for writing prompts
The following tips help you write prompts that generate your videos:
Use descriptive language : Use adjectives and adverbs to paint a clear picture for Veo.
Provide context : If necessary, include background information to help your model understand what you want.
Reference specific artistic styles : If you have a particular aesthetic in mind, reference specific artistic styles or art movements.
Utilize prompt engineering tools : Consider exploring prompt engineering tools or resources to help you refine your prompts and achieve optimal results. For more information, visit Introduction to prompt design .
Enhance the facial details in your personal and group images : Specify facial details as a focus of the photo, for example by using the word "portrait" in the prompt.

Example prompts and output
This section presents several prompts, highlighting how descriptive details can elevate the outcome of each video.

Icicles
This video demonstrates how you can use the elements of prompt writing basics in your prompt.
Prompt: Close up shot (composition) of melting icicles (subject) on a frozen rock wall (context) with cool blue tones (ambiance), zoomed in (camera motion) maintaining close-up detail of water drips (action).

Man on the phone
These videos demonstrate how you can revise your prompt with increasingly specific details to get Veo to refine the output to your liking.
Prompt: The camera dollies to show a close up of a desperate man in a green trench coat. He's making a call on a rotary-style wall phone with a green neon light. It looks like a movie scene.
Analysis: This is the first generated video based on the prompt.
Prompt: A close-up cinematic shot follows a desperate man in a weathered green trench coat as he dials a rotary phone mounted on a gritty brick wall, bathed in the eerie glow of a green neon sign. The camera dollies in, revealing the tension in his jaw and the desperation etched on his face as he struggles to make the call. The shallow depth of field focuses on his furrowed brow and the black rotary phone, blurring the background into a sea of neon colors and indistinct shadows, creating a sense of urgency and isolation.
Analysis: A more detailed prompt results in a video that is more focused with a richer environment.
Prompt: A video with smooth motion that dollies in on a desperate man in a green trench coat, using a vintage rotary phone against a wall bathed in an eerie green neon glow. The camera starts from a medium distance, slowly moving closer to the man's face, revealing his frantic expression and the sweat on his brow as he urgently dials the phone. The focus is on the man's hands, his fingers fumbling with the dial as he desperately tries to connect. The green neon light casts long shadows on the wall, adding to the tense atmosphere. The scene is framed to emphasize the isolation and desperation of the man, highlighting the stark contrast between the vibrant glow of the neon and the man's grim determination.
Analysis: Adding more detail gives the subject a realistic expression and creates an intense and vibrant scene.

Snow leopard
This example demonstrates the output Veo might generate for a simple prompt.
Prompt: A cute creature with snow leopard-like fur is walking in winter forest, 3D cartoon style render.

Running snow leopard
This prompt has more detail and demonstrates generated output that might be closer to what you want in your video.
Prompt: Create a short 3D animated scene in a joyful cartoon style. A cute creature with snow leopard-like fur, large expressive eyes, and a friendly, rounded form happily prances through a whimsical winter forest. The scene should feature rounded, snow-covered trees, gentle falling snowflakes, and warm sunlight filtering through the branches. The creature's bouncy movements and wide smile should convey pure delight. Aim for an upbeat, heartwarming tone with bright, cheerful colors and playful animation.
Examples by writing elements
These examples show you how to refine your prompts by each basic element.

Subject
This example shows you how to specify a subject description.
Description: The description can include a subject, or multiple subjects and actions. Here, our subject is "white concrete apartment building."
Prompt: An architectural rendering of a white concrete apartment building with flowing organic shapes, seamlessly blending with lush greenery and futuristic elements

Context
This example shows you how to specify context.
Description: The background or context in which the subject will be placed is very important. Try placing your subject in a variety of backgrounds like on a busy street, or in outer space.
Prompt: A satellite floating through outer space with the moon and some stars in the background.

Action
This example shows you how to specify action.
Description: What is the subject doing, like walking, running, or turning their head.
Prompt: A wide shot of a woman walking along the beach, looking content and relaxed towards the horizon at sunset.

Style
This example shows you how to specify style.
Description: You can add keywords to improve generation quality and steer it closer to intended style, such as shallow depth of field, movie still, minimalistic, surreal, vintage, futuristic, or double-exposure.
Prompt: Film noir style, man and woman walk on the street, mystery, cinematic, black and white.

Camera motion
This example shows you how to specify camera motion.
Description: Options for camera motion include POV shot, aerial view, tracking drone view, or tracking shot.
Prompt: A POV shot from a vintage car driving in the rain, Canada at night, cinematic.

Composition
This example shows you how to specify composition.
Description: How the shot is framed (wide shot, close-up, low angle).
Prompt: Extreme close-up of an eye with city reflected in it.
Prompt: Create a video of a wide shot of surfer walking on a beach with a surfboard, beautiful sunset, cinematic.

Ambiance
This example shows you how to specify ambiance.
Description: Color palettes play a vital role in photography, influencing the mood and conveying intended emotions. Try things like "muted orange warm tones," "natural light," "sunrise" or "sunset". For example, a warm, golden palette can infuse a romantic and atmospheric feel into a photograph.
Prompt: A close-up of a girl holding adorable golden retriever puppy in the park, sunlight.
Prompt: Cinematic close-up shot of a sad woman riding a bus in the rain, cool blue tones, sad mood.

Use reference images to generate videos
You can bring images to life by using Veo's image-to-video capability. You can use existing assets, or try Imagen to generate something new.
Prompt: Bunny with a chocolate candy bar. Bunny runs away.

Negative prompts
Negative prompts can be a powerful tool to help specify elements you don't want in the video. Describe what you want to discourage the model from generating after the phrase "Negative prompt". Follow these tips:
❌ Don't use instructive language or words like no or don't . For example, "No walls" or "don't show walls".
✅ Do describe what you don't want to see. For example, "wall, frame", which means that you don't want a wall or a frame in the video.
Prompt: Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind.
The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette.
Prompt with negative prompt: Generate a short, stylized animation of a large, solitary oak tree with leaves blowing vigorously in a strong wind. The tree should have a slightly exaggerated, whimsical form, with dynamic, flowing branches. The leaves should display a variety of autumn colors, swirling and dancing in the wind. The animation should use a warm, inviting color palette. Negative prompt: urban background, man-made structures, dark, stormy, or threatening atmosphere.

Aspect ratios
Gemini Veo video generation supports the following two aspect ratios:
Widescreen or 16:9: The most common aspect ratio for televisions, monitors, and mobile phone screens (landscape). Use this when you want to capture more of the background, like in scenic landscapes.
Portrait or 9:16: Rotated widescreen. This aspect ratio has been popularized by short form video applications, such as YouTube Shorts. Use this for portraits or tall objects with strong vertical orientations, such as buildings, trees, or waterfalls.

Widescreen
This prompt is an example of the widescreen aspect ratio of 16:9.
Prompt: Create a video with a tracking drone view of a man driving a red convertible car in Palm Springs, 1970s, warm sunlight, long shadows.

Portrait
This prompt is an example of the portrait aspect ratio of 9:16.
Prompt: Create a video highlighting the smooth motion of a majestic Hawaiian waterfall within a lush rainforest. Focus on realistic water flow, detailed foliage, and natural lighting to convey tranquility. Capture the rushing water, misty atmosphere, and dappled sunlight filtering through the dense canopy. Use smooth, cinematic camera movements to showcase the waterfall and its surroundings. Aim for a peaceful, realistic tone, transporting the viewer to the serene beauty of the Hawaiian rainforest.

What's next
Gain more experience generating AI videos with the Veo Colab . Check out cool examples using Veo 2 on the Google DeepMind site

\ No newline at end of file diff --git a/text_content/docs_vision_88e7ce7a.txt b/text_content/docs_vision_88e7ce7a.txt new file mode 100644 index 0000000000000000000000000000000000000000..e365451ea25774b89d06c9dd3e26d1991f7340ec --- /dev/null +++ b/text_content/docs_vision_88e7ce7a.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/vision#capabilities +Title: Image understanding | Gemini API | Google AI for Developers +================================================== + +Image understanding | Gemini API | Google AI for Developers
Image understanding
Gemini models are built to be multimodal from the ground up, unlocking a wide range of image processing and computer vision tasks including but not limited to image captioning, classification, and visual question answering without having to train specialized ML models.
Tip: In addition to their general multimodal capabilities, Gemini models (2.0 and newer) offer improved accuracy for specific use cases like object detection and segmentation , through additional training. See the Capabilities section for more details.

Passing images to Gemini
You can provide images as input to Gemini using two methods: Passing inline image data : Ideal for smaller files (total request size less than 20MB, including prompts). Uploading images using the File API : Recommended for larger files or for reusing images across multiple requests.

Passing inline image data
You can pass inline image data in the request to generateContent . You can provide image data as Base64 encoded strings or by reading local files directly (depending on the language). The following example shows how to read an image from a local file and pass it to generateContent API for processing.

Python
from google.genai import types

with open('path/to/small-sample.jpg', 'rb') as f:
    image_bytes = f.read()

response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents=[
        types.Part.from_bytes(
            data=image_bytes,
            mime_type='image/jpeg',
        ),
        'Caption this image.'
    ]
)

print(response.text)

JavaScript
import { GoogleGenAI } from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});
const base64ImageFile = fs.readFileSync("path/to/small-sample.jpg", {
  encoding: "base64",
});

const contents = [
  {
    inlineData: {
      mimeType: "image/jpeg",
      data: base64ImageFile,
    },
  },
  { text: "Caption this image." },
];

const response = await ai.models.generateContent({
  model: "gemini-2.5-flash",
  contents: contents,
});
console.log(response.text);

Go
bytes, _ := os.ReadFile("path/to/small-sample.jpg")

parts := []*genai.Part{
    genai.NewPartFromBytes(bytes, "image/jpeg"),
    genai.NewPartFromText("Caption this image."),
}
contents := []*genai.Content{
    genai.NewContentFromParts(parts, genai.RoleUser),
}

result, _ := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, nil)
fmt.Println(result.Text())

REST
IMG_PATH="/path/to/your/image1.jpg"

if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {
          "inline_data": {
            "mime_type":"image/jpeg",
            "data": "'"$(base64 $B64FLAGS $IMG_PATH)"'"
          }
        },
        {"text": "Caption this image."}
      ]
    }]
  }' 2>/dev/null

You can also fetch an image from a URL, convert it to bytes, and pass it to generateContent as shown in the following examples.

Python
from google import genai
from google.genai import types

import requests

image_path = "https://goo.gle/instrument-img"
image_bytes = requests.get(image_path).content
image = types.Part.from_bytes(
    data=image_bytes, mime_type="image/jpeg"
)

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=["What is this image?", image],
)

print(response.text)
JavaScript
import { GoogleGenAI } from "@google/genai";

async function main() {
  const ai = new GoogleGenAI({});

  const imageUrl = "https://goo.gle/instrument-img";
  const response = await fetch(imageUrl);
  const imageArrayBuffer = await response.arrayBuffer();
  const base64ImageData = Buffer.from(imageArrayBuffer).toString('base64');

  const result = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: [
      {
        inlineData: {
          mimeType: 'image/jpeg',
          data: base64ImageData,
        },
      },
      { text: "Caption this image." }
    ],
  });
  console.log(result.text);
}

main();

Go
package main

import (
    "context"
    "fmt"
    "io"
    "log"
    "net/http"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    // Download the image.
    imageResp, _ := http.Get("https://goo.gle/instrument-img")
    imageBytes, _ := io.ReadAll(imageResp.Body)

    parts := []*genai.Part{
        genai.NewPartFromBytes(imageBytes, "image/jpeg"),
        genai.NewPartFromText("Caption this image."),
    }
    contents := []*genai.Content{
        genai.NewContentFromParts(parts, genai.RoleUser),
    }

    result, _ := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, nil)
    fmt.Println(result.Text())
}

REST
IMG_URL="https://goo.gle/instrument-img"

MIME_TYPE=$(curl -sIL "$IMG_URL" | grep -i '^content-type:' | awk -F ': ' '{print $2}' | sed 's/\r$//' | head -n 1)
if [[ -z "$MIME_TYPE" || ! "$MIME_TYPE" == image/* ]]; then
  MIME_TYPE="image/jpeg"
fi

# Check for macOS
if [[ "$(uname)" == "Darwin" ]]; then
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -b 0)
elif [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64)
else
  IMAGE_B64=$(curl -sL "$IMG_URL" | base64 -w0)
fi

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {
          "inline_data": {
            "mime_type":"'"$MIME_TYPE"'",
            "data": "'"$IMAGE_B64"'"
          }
        },
        {"text": "Caption this image."}
      ]
    }]
  }' 2>/dev/null

Note: Inline image data limits your total request size (text prompts, system instructions, and inline bytes) to 20MB. For larger requests, upload image files using the File API. The Files API is also more efficient for scenarios that use the same image repeatedly.

Uploading images using the File API
For large files or to be able to use the same image file repeatedly, use the Files API. The following code uploads an image file and then uses the file in a call to generateContent . See the Files API guide for more information and examples.

Python
from google import genai

client = genai.Client()

my_file = client.files.upload(file="path/to/sample.jpg")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[my_file, "Caption this image."],
)

print(response.text)

JavaScript
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const myfile = await ai.files.upload({
    file: "path/to/sample.jpg",
    config: { mimeType: "image/jpeg" },
  });

  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([
      createPartFromUri(myfile.uri, myfile.mimeType),
      "Caption this image.",
    ]),
  });
  console.log(response.text);
}

await main();

Go
package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    uploadedFile, _ := client.Files.UploadFromPath(ctx, "path/to/sample.jpg", nil)

    parts := []*genai.Part{
        genai.NewPartFromText("Caption this image."),
        genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
    }
    contents := []*genai.Content{
        genai.NewContentFromParts(parts, genai.RoleUser),
    }

    result, _ := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, nil)
    fmt.Println(result.Text())
}

REST
IMAGE_PATH="path/to/sample.jpg"
MIME_TYPE=$(file -b --mime-type "${IMAGE_PATH}")
NUM_BYTES=$(wc -c < "${IMAGE_PATH}")
DISPLAY_NAME=IMAGE

tmp_header_file=upload-header.tmp

# Initial resumable request defining metadata.
# The upload url is in the response headers dump them to a file.
curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D upload-header.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME}'}}" 2>/dev/null

upload_url=$(grep -i "x-goog-upload-url: " "${tmp_header_file}" | cut -d " " -f2 | tr -d "\r")
rm "${tmp_header_file}"

# Upload the actual bytes.
curl "${upload_url}" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H "Content-Length: ${NUM_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${IMAGE_PATH}" 2>/dev/null > file_info.json

file_uri=$(jq -r ".file.uri" file_info.json)
echo file_uri=$file_uri

# Now generate content using that file
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"file_data":{"mime_type": "'"${MIME_TYPE}"'", "file_uri": "'"${file_uri}"'"}},
        {"text": "Caption this image."}]
    }]
  }' 2>/dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json

Prompting with multiple images
You can provide multiple images in a single prompt by including multiple image Part objects in the contents array. These can be a mix of inline data (local files or URLs) and File API references.

Python
from google import genai
from google.genai import types

client = genai.Client()

# Upload the first image
image1_path = "path/to/image1.jpg"
uploaded_file = client.files.upload(file=image1_path)

# Prepare the second image as inline data
image2_path = "path/to/image2.png"
with open(image2_path, 'rb') as f:
    img2_bytes = f.read()

# Create the prompt with text and multiple images
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        "What is different between these two images?",
        uploaded_file,  # Use the uploaded file reference
        types.Part.from_bytes(
            data=img2_bytes,
            mime_type='image/png'
        )
    ]
)

print(response.text)
JavaScript
import {
  GoogleGenAI,
  createUserContent,
  createPartFromUri,
} from "@google/genai";
import * as fs from "node:fs";

const ai = new GoogleGenAI({});

async function main() {
  // Upload the first image
  const image1_path = "path/to/image1.jpg";
  const uploadedFile = await ai.files.upload({
    file: image1_path,
    config: { mimeType: "image/jpeg" },
  });

  // Prepare the second image as inline data
  const image2_path = "path/to/image2.png";
  const base64Image2File = fs.readFileSync(image2_path, {
    encoding: "base64",
  });

  // Create the prompt with text and multiple images
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: createUserContent([
      "What is different between these two images?",
      createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
      {
        inlineData: {
          mimeType: "image/png",
          data: base64Image2File,
        },
      },
    ]),
  });
  console.log(response.text);
}

await main();

Go
// Upload the first image
image1Path := "path/to/image1.jpg"
uploadedFile, _ := client.Files.UploadFromPath(ctx, image1Path, nil)

// Prepare the second image as inline data
image2Path := "path/to/image2.jpeg"
imgBytes, _ := os.ReadFile(image2Path)

parts := []*genai.Part{
    genai.NewPartFromText("What is different between these two images?"),
    genai.NewPartFromBytes(imgBytes, "image/jpeg"),
    genai.NewPartFromURI(uploadedFile.URI, uploadedFile.MIMEType),
}
contents := []*genai.Content{
    genai.NewContentFromParts(parts, genai.RoleUser),
}

result, _ := client.Models.GenerateContent(ctx, "gemini-2.5-flash", contents, nil)
fmt.Println(result.Text())

REST
# Upload the first image
IMAGE1_PATH="path/to/image1.jpg"
MIME1_TYPE=$(file -b --mime-type "${IMAGE1_PATH}")
NUM1_BYTES=$(wc -c < "${IMAGE1_PATH}")
DISPLAY_NAME1=IMAGE1

tmp_header_file1=upload-header1.tmp

curl "https://generativelanguage.googleapis.com/upload/v1beta/files" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -D upload-header1.tmp \
  -H "X-Goog-Upload-Protocol: resumable" \
  -H "X-Goog-Upload-Command: start" \
  -H "X-Goog-Upload-Header-Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Header-Content-Type: ${MIME1_TYPE}" \
  -H "Content-Type: application/json" \
  -d "{'file': {'display_name': '${DISPLAY_NAME1}'}}" 2>/dev/null

upload_url1=$(grep -i "x-goog-upload-url: " "${tmp_header_file1}" | cut -d " " -f2 | tr -d "\r")
rm "${tmp_header_file1}"

curl "${upload_url1}" \
  -H "Content-Length: ${NUM1_BYTES}" \
  -H "X-Goog-Upload-Offset: 0" \
  -H "X-Goog-Upload-Command: upload, finalize" \
  --data-binary "@${IMAGE1_PATH}" 2>/dev/null > file_info1.json

file1_uri=$(jq ".file.uri" file_info1.json)
echo file1_uri=$file1_uri

# Prepare the second image (inline)
IMAGE2_PATH="path/to/image2.png"
MIME2_TYPE=$(file -b --mime-type "${IMAGE2_PATH}")

if [[ "$(base64 --version 2>&1)" = *"FreeBSD"* ]]; then
  B64FLAGS="--input"
else
  B64FLAGS="-w0"
fi
IMAGE2_BASE64=$(base64 $B64FLAGS $IMAGE2_PATH)

# Now generate content using both images
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [{
      "parts":[
        {"text": "What is different between these two images?"},
        {"file_data":{"mime_type": "'"${MIME1_TYPE}"'", "file_uri": '$file1_uri'}},
        {
          "inline_data": {
            "mime_type":"'"${MIME2_TYPE}"'",
            "data": "'"$IMAGE2_BASE64"'"
          }
        }
      ]
    }]
  }' 2>/dev/null > response.json

cat response.json
echo

jq ".candidates[].content.parts[].text" response.json
Object detection

From Gemini 2.0 onwards, models are further trained to detect objects in an image and return their bounding box coordinates. The coordinates are normalized to the range [0, 1000], relative to the image dimensions, so you need to descale them based on your original image size.

Python

from google import genai
from google.genai import types
from PIL import Image
import json

client = genai.Client()

prompt = "Detect all of the prominent items in the image. The box_2d should be [ymin, xmin, ymax, xmax] normalized to 0-1000."

image = Image.open("/path/to/image.png")

config = types.GenerateContentConfig(
    response_mime_type="application/json"
)

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[image, prompt],
    config=config
)

width, height = image.size
bounding_boxes = json.loads(response.text)

converted_bounding_boxes = []
for bounding_box in bounding_boxes:
    abs_y1 = int(bounding_box["box_2d"][0] / 1000 * height)
    abs_x1 = int(bounding_box["box_2d"][1] / 1000 * width)
    abs_y2 = int(bounding_box["box_2d"][2] / 1000 * height)
    abs_x2 = int(bounding_box["box_2d"][3] / 1000 * width)
    converted_bounding_boxes.append([abs_x1, abs_y1, abs_x2, abs_y2])

print("Image size: ", width, height)
print("Bounding boxes:", converted_bounding_boxes)

Note: The model also supports generating bounding boxes based on custom instructions, such as "Show bounding boxes of all green objects in this image". It also supports custom labels, such as "Label the items with the allergens they can contain".

For more examples, check the following notebooks in the Gemini Cookbook:

- 2D spatial understanding notebook
- Experimental 3D pointing notebook
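For a quick visual check, you can draw the descaled boxes back onto the image. This is a minimal follow-on sketch (not part of the original sample) that assumes the image and converted_bounding_boxes variables from the Python example above are still in scope:

from PIL import ImageDraw

draw = ImageDraw.Draw(image)
for x1, y1, x2, y2 in converted_bounding_boxes:
    # Boxes are already descaled to absolute pixel coordinates.
    draw.rectangle([x1, y1, x2, y2], outline="red", width=3)
image.save("annotated.png")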
Segmentation

Starting with Gemini 2.5, models not only detect items but also segment them and provide their contour masks.

The model predicts a JSON list, where each item represents a segmentation mask. Each item has a bounding box ("box_2d") in the format [y0, x0, y1, x1] with normalized coordinates between 0 and 1000, a label ("label") that identifies the object, and finally the segmentation mask inside the bounding box, as a base64-encoded PNG that is a probability map with values between 0 and 255. The mask needs to be resized to match the bounding box dimensions, then binarized at your confidence threshold (127 for the midpoint).

Note: For better results, disable thinking by setting the thinking budget to 0. See the code sample below for an example.

Python

from google import genai
from google.genai import types
from PIL import Image, ImageDraw
import io
import base64
import json
import numpy as np
import os

client = genai.Client()

def parse_json(json_output: str):
    # Parse out the markdown fencing
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line == "```json":
            json_output = "\n".join(lines[i + 1:])  # Remove everything before "```json"
            json_output = json_output.split("```")[0]  # Remove everything after the closing "```"
            break  # Exit the loop once "```json" is found
    return json_output

def extract_segmentation_masks(image_path: str, output_dir: str = "segmentation_outputs"):
    # Load and resize image
    im = Image.open(image_path)
    im.thumbnail([1024, 1024], Image.Resampling.LANCZOS)

    prompt = """
    Give the segmentation masks for the wooden and glass items.
    Output a JSON list of segmentation masks where each entry contains the 2D
    bounding box in the key "box_2d", the segmentation mask in key "mask", and
    the text label in the key "label". Use descriptive labels.
    """

    config = types.GenerateContentConfig(
        # Set thinking_budget to 0 for better results in object detection
        thinking_config=types.ThinkingConfig(thinking_budget=0)
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        # Pillow images can be directly passed as inputs (the SDK converts them)
        contents=[prompt, im],
        config=config
    )

    # Parse JSON response
    items = json.loads(parse_json(response.text))

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Process each mask
    for i, item in enumerate(items):
        # Get bounding box coordinates
        box = item["box_2d"]
        y0 = int(box[0] / 1000 * im.size[1])
        x0 = int(box[1] / 1000 * im.size[0])
        y1 = int(box[2] / 1000 * im.size[1])
        x1 = int(box[3] / 1000 * im.size[0])

        # Skip invalid boxes
        if y0 >= y1 or x0 >= x1:
            continue

        # Process mask
        png_str = item["mask"]
        if not png_str.startswith("data:image/png;base64,"):
            continue

        # Remove prefix
        png_str = png_str.removeprefix("data:image/png;base64,")
        mask_data = base64.b64decode(png_str)
        mask = Image.open(io.BytesIO(mask_data))

        # Resize mask to match bounding box
        mask = mask.resize((x1 - x0, y1 - y0), Image.Resampling.BILINEAR)

        # Convert mask to numpy array for processing
        mask_array = np.array(mask)

        # Create overlay for this mask
        overlay = Image.new('RGBA', im.size, (0, 0, 0, 0))
        overlay_draw = ImageDraw.Draw(overlay)

        # Fill in the mask region above the threshold
        color = (255, 255, 255, 200)
        for y in range(y0, y1):
            for x in range(x0, x1):
                if mask_array[y - y0, x - x0] > 128:  # Threshold for mask
                    overlay_draw.point((x, y), fill=color)

        # Save individual mask and its overlay
        mask_filename = f"{item['label']}_{i}_mask.png"
        overlay_filename = f"{item['label']}_{i}_overlay.png"
        mask.save(os.path.join(output_dir, mask_filename))

        # Create and save overlay
        composite = Image.alpha_composite(im.convert('RGBA'), overlay)
        composite.save(os.path.join(output_dir, overlay_filename))
        print(f"Saved mask and overlay for {item['label']} to {output_dir}")

# Example usage
if __name__ == "__main__":
    extract_segmentation_masks("path/to/image.png")

Check the segmentation example in the cookbook guide for a more detailed example.

[Image: An example segmentation output with objects and segmentation masks]
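As a side note on the example above: the nested per-pixel loop is easy to follow but slow for large boxes. The same overlay step can be vectorized with NumPy; this is a sketch under the assumption that mask is a single-channel (grayscale) image and that im, mask, x0, y0, x1, y1 from the example are in scope:

import numpy as np
from PIL import Image

# Binarize the resized mask at the midpoint threshold.
binary = np.array(mask) > 127

# Build a transparent RGBA overlay and fill only the in-mask pixels of the box.
overlay_px = np.zeros((im.size[1], im.size[0], 4), dtype=np.uint8)
overlay_px[y0:y1, x0:x1][binary] = (255, 255, 255, 200)

overlay = Image.fromarray(overlay_px, mode="RGBA")
composite = Image.alpha_composite(im.convert("RGBA"), overlay)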
Supported image formats

Gemini supports the following image format MIME types:

- PNG - image/png
- JPEG - image/jpeg
- WEBP - image/webp
- HEIC - image/heic
- HEIF - image/heif

Capabilities

All Gemini model versions are multimodal and can be used for a wide range of image processing and computer vision tasks, including but not limited to image captioning, visual question answering, image classification, object detection, and segmentation. Depending on your quality and performance requirements, Gemini can reduce the need for specialized ML models. Some later model versions are specifically trained to improve the accuracy of specialized tasks in addition to their generic capabilities:

- Gemini 2.0 models are further trained to support enhanced object detection.
- Gemini 2.5 models are further trained to support enhanced segmentation in addition to object detection.

Limitations and key technical information

File limit

Gemini 2.5 Pro/Flash, 2.0 Flash, 1.5 Pro, and 1.5 Flash support a maximum of 3,600 image files per request.

Token calculation

- Gemini 1.5 Flash and Gemini 1.5 Pro: 258 tokens if both dimensions are <= 384 pixels. Larger images are tiled (minimum tile 256 px, maximum 768 px, resized to 768x768), with each tile costing 258 tokens.
- Gemini 2.0 Flash and Gemini 2.5 Flash/Pro: 258 tokens if both dimensions are <= 384 pixels. Larger images are tiled into 768x768 pixel tiles, each costing 258 tokens.
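For illustration, the 2.0/2.5 rule above can be turned into a rough cost estimate. The exact tiling and resizing logic isn't fully specified here, so the ceiling-based tile count below is an assumption of this sketch, not an official formula:

import math

def estimate_image_tokens(width: int, height: int) -> int:
    # Small images are billed at a flat 258 tokens.
    if width <= 384 and height <= 384:
        return 258
    # Approximation: assume the image is covered by 768x768 tiles,
    # each costing 258 tokens.
    tiles = math.ceil(width / 768) * math.ceil(height / 768)
    return tiles * 258

print(estimate_image_tokens(300, 300))    # 258
print(estimate_image_tokens(1920, 1080))  # 6 tiles -> 1548 (approximate)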
Tips and best practices

- Verify that images are correctly rotated.
- Use clear, non-blurry images.
- When using a single image with text, place the text prompt after the image part in the contents array.

What's next

This guide shows you how to upload image files and generate text outputs from image inputs. To learn more, see the following resources:

- Files API: Learn more about uploading and managing files for use with Gemini.
- System instructions: System instructions let you steer the behavior of the model based on your specific needs and use cases.
- File prompting strategies: The Gemini API supports prompting with text, image, audio, and video data, also known as multimodal prompting.
- Safety guidance: Sometimes generative AI models produce unexpected outputs, such as outputs that are inaccurate, biased, or offensive. Post-processing and human evaluation are essential to limit the risk of harm from such outputs.

Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-08 UTC.
\ No newline at end of file
diff --git a/text_content/gemini-api_docs_00fe02b2.txt b/text_content/gemini-api_docs_00fe02b2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dc7df827c590493f59d20c5a5e78d26c973c274e
--- /dev/null
+++ b/text_content/gemini-api_docs_00fe02b2.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs
+Title: Gemini API  |  Google AI for Developers
+==================================================
+
+Gemini API | Google AI for Developers

Gemini Developer API

Get a Gemini API key and make your first API request in minutes.

Python

from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
)
print(response.text)

JavaScript

import { GoogleGenAI } from "@google/genai";

const ai = new GoogleGenAI({});

async function main() {
  const response = await ai.models.generateContent({
    model: "gemini-2.5-flash",
    contents: "Explain how AI works in a few words",
  });
  console.log(response.text);
}

await main();

Go

package main

import (
    "context"
    "fmt"
    "log"

    "google.golang.org/genai"
)

func main() {
    ctx := context.Background()
    client, err := genai.NewClient(ctx, nil)
    if err != nil {
        log.Fatal(err)
    }

    result, err := client.Models.GenerateContent(
        ctx,
        "gemini-2.5-flash",
        genai.Text("Explain how AI works in a few words"),
        nil,
    )
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(result.Text())
}

Java

package com.example;

import com.google.genai.Client;
import com.google.genai.types.GenerateContentResponse;

public class GenerateTextFromTextInput {
    public static void main(String[] args) {
        Client client = new Client();
        GenerateContentResponse response = client.models.generateContent(
            "gemini-2.5-flash",
            "Explain how AI works in a few words",
            null
        );
        System.out.println(response.text());
    }
}

REST

curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent" \
  -H "x-goog-api-key: $GEMINI_API_KEY" \
  -H 'Content-Type: application/json' \
  -X POST \
  -d '{
    "contents": [
      {
        "parts": [
          { "text": "Explain how AI works in a few words" }
        ]
      }
    ]
  }'

Meet the models

Use Gemini in Google AI Studio

- 2.5 Pro: Our most powerful thinking model, with features for complex reasoning and much more
- 2.5 Flash: Our newest multimodal model, with next generation features and improved capabilities
- 2.5 Flash-Lite: Our fastest and most cost-efficient multimodal model, with great performance for high-frequency tasks

Explore the API

- Native Image Generation: Generate and edit highly contextual images natively with Gemini 2.0 Flash.
- Explore long context: Input millions of tokens to Gemini models and derive understanding from unstructured images, videos, and documents.
- Generate structured outputs: Constrain Gemini to respond with JSON, a structured data format suitable for automated processing.

Start building with the Gemini API: Get started

Last updated 2025-06-27 UTC.
\ No newline at end of file
diff --git a/text_content/models_experimental-models_1712d6ff.txt b/text_content/models_experimental-models_1712d6ff.txt
new file mode 100644
index 0000000000000000000000000000000000000000..61d81cdddbc8a5888f60d2778ae9f54b13075569
--- /dev/null
+++ b/text_content/models_experimental-models_1712d6ff.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#main-content
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
+
+Gemini models | Gemini API | Google AI for Developers

Gemini models

- 2.5 Pro: Our most powerful thinking model with maximum response accuracy and state-of-the-art performance. Input audio, images, video, and text; get text responses. Tackle difficult problems, analyze large databases, and more. Best for complex coding, reasoning, and multimodal understanding.
- 2.5 Flash: Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text; get text responses. The model thinks as needed, or you can configure a thinking budget. Best for low latency, high volume tasks that require thinking.
- 2.5 Flash-Lite: A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text; get text responses. Most cost-efficient model supporting high throughput. Best for real time, low latency use cases.

Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default. If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend reviewing the Thinking guide first.

Model variants

The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of the available Gemini variants (model variant | inputs | output | optimized for):

- Gemini 2.5 Pro (gemini-2.5-pro) | Audio, images, videos, text, and PDF | Text | Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more
- Gemini 2.5 Flash (gemini-2.5-flash) | Audio, images, videos, and text | Text | Adaptive thinking, cost efficiency
- Gemini 2.5 Flash-Lite Preview (gemini-2.5-flash-lite-preview-06-17) | Text, images, video, and audio | Text | Most cost-efficient model supporting high throughput
- Gemini 2.5 Flash Native Audio (gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog) | Audio, videos, and text | Text and audio, interleaved | High quality, natural conversational audio outputs, with or without thinking
- Gemini 2.5 Flash Preview TTS (gemini-2.5-flash-preview-tts) | Text | Audio | Low latency, controllable, single- and multi-speaker text-to-speech audio generation
- Gemini 2.5 Pro Preview TTS (gemini-2.5-pro-preview-tts) | Text | Audio | Low latency, controllable, single- and multi-speaker text-to-speech audio generation
- Gemini 2.0 Flash (gemini-2.0-flash) | Audio, images, videos, and text | Text | Next generation features, speed, and realtime streaming
- Gemini 2.0 Flash Preview Image Generation (gemini-2.0-flash-preview-image-generation) | Audio, images, videos, and text | Text, images | Conversational image generation and editing
- Gemini 2.0 Flash-Lite (gemini-2.0-flash-lite) | Audio, images, videos, and text | Text | Cost efficiency and low latency
- Gemini 1.5 Flash (gemini-1.5-flash) | Audio, images, videos, and text | Text | Fast and versatile performance across a diverse variety of tasks
- Gemini 1.5 Flash-8B (gemini-1.5-flash-8b) | Audio, images, videos, and text | Text | High volume and lower intelligence tasks
- Gemini 1.5 Pro (gemini-1.5-pro) | Audio, images, videos, and text | Text | Complex reasoning tasks requiring more intelligence
- Gemini Embedding (gemini-embedding-exp) | Text | Text embeddings | Measuring the relatedness of text strings
- Imagen 4 (imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06) | Text | Images | Our most up-to-date image generation model
- Imagen 3 (imagen-3.0-generate-002) | Text | Images | High quality image generation model
- Veo 2 (veo-2.0-generate-001) | Text, images | Video | High quality video generation
- Gemini 2.5 Flash Live (gemini-live-2.5-flash-preview) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions
- Gemini 2.0 Flash Live (gemini-2.0-flash-live-001) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions

You can view the rate limits for each model on the rate limits page.
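A quick way to see which of these variants your API key can access is to enumerate them with the SDK. This is a minimal sketch using the Python SDK shown elsewhere in these docs (the exact fields on the returned model objects may vary by SDK version):

from google import genai

client = genai.Client()

# List the models available to this API key and print their ids.
for model in client.models.list():
    print(model.name)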
Gemini 2.5 Pro

Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context.

Try in Google AI Studio

Model details:
- Model code: gemini-2.5-pro
- Supported data types: Inputs: audio, images, video, text, and PDF; Output: text
- Token limits [*]: Input: 1,048,576; Output: 65,536
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search grounding: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Thinking: Supported; Batch API: Supported
- Versions (read the model version patterns for more details): Stable: gemini-2.5-pro; Preview: gemini-2.5-pro-preview-06-05, gemini-2.5-pro-preview-05-06, gemini-2.5-pro-preview-03-25
- Latest update: June 2025
- Knowledge cutoff: January 2025

Gemini 2.5 Flash

Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.5-flash
- Supported data types: Inputs: text, images, video, and audio; Output: text
- Token limits [*]: Input: 1,048,576; Output: 65,536
- Capabilities: Audio generation: Not supported; Caching: Supported; Code execution: Supported; Function calling: Supported; Image generation: Not supported; Search grounding: Supported; Structured outputs: Supported; Thinking: Supported; Tuning: Not supported; Batch API: Supported
- Versions (read the model version patterns for more details): Stable: gemini-2.5-flash; Preview: gemini-2.5-flash-preview-05-20
- Latest update: June 2025
- Knowledge cutoff: January 2025

Gemini 2.5 Flash-Lite Preview

A Gemini 2.5 Flash model optimized for cost efficiency and low latency.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.5-flash-lite-preview-06-17
- Supported data types: Inputs: text, images, video, and audio; Output: text
- Token limits [*]: Input: 1,000,000; Output: 64,000
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; URL context: Supported; Search grounding: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Thinking: Supported
- Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-lite-preview-06-17
- Latest update: June 2025
- Knowledge cutoff: January 2025

Gemini 2.5 Flash Native Audio

Our native audio dialog models, with and without thinking, available through the Live API. These models provide interactive and unstructured conversational experiences, with style and control prompting.
Try native audio in Google AI Studio

Model details:
- Model codes: models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog
- Supported data types: Inputs: audio, video, and text; Output: audio and text
- Token limits [*]: Input: 128,000; Output: 8,000
- Capabilities: Audio generation: Supported; Caching: Not supported; Code execution: Not supported; Function calling: Supported; Image generation: Not supported; Search grounding: Supported; Structured outputs: Not supported; Thinking: Supported; Tuning: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-preview-05-20; Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog
- Latest update: May 2025
- Knowledge cutoff: January 2025

Gemini 2.5 Flash Preview Text-to-Speech

Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restricted since it is an experimental/preview model.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.5-flash-preview-tts
- Supported data types: Inputs: text; Output: audio
- Token limits [*]: Input: 8,000; Output: 16,000
- Capabilities: Structured outputs: Not supported; Caching: Not supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Audio generation: Supported; Live API: Not supported; Thinking: Not supported
- Versions (read the model version patterns for more details): gemini-2.5-flash-preview-tts
- Latest update: May 2025

Gemini 2.5 Pro Preview Text-to-Speech

Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restricted since it is an experimental/preview model.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.5-pro-preview-tts
- Supported data types: Inputs: text; Output: audio
- Token limits [*]: Input: 8,000; Output: 16,000
- Capabilities: Structured outputs: Not supported; Caching: Not supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Audio generation: Supported; Live API: Not supported; Thinking: Not supported
- Versions (read the model version patterns for more details): gemini-2.5-pro-preview-tts
- Latest update: May 2025
Gemini 2.0 Flash

Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.0-flash
- Supported data types: Inputs: audio, images, video, and text; Output: text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Supported; Thinking: Experimental; Batch API: Supported
- Versions (read the model version patterns for more details): Latest: gemini-2.0-flash; Stable: gemini-2.0-flash-001; Experimental: gemini-2.0-flash-exp
- Latest update: February 2025
- Knowledge cutoff: August 2024

Gemini 2.0 Flash Preview Image Generation

Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.0-flash-preview-image-generation
- Supported data types: Inputs: audio, images, video, and text; Output: text and images
- Token limits [*]: Input: 32,000; Output: 8,192
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Image generation: Supported; Audio generation: Not supported; Live API: Not supported; Thinking: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-preview-image-generation
- Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa
- Latest update: May 2025
- Knowledge cutoff: August 2024

Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.0-flash-lite
- Supported data types: Inputs: audio, images, video, and text; Output: text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Not supported; Search: Not supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Batch API: Supported
- Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite; Stable: gemini-2.0-flash-lite-001
- Latest update: February 2025
- Knowledge cutoff: August 2024

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.

Try in Google AI Studio

Model details:
- Model code: models/gemini-1.5-flash
- Supported data types: Inputs: audio, images, video, and text; Output: text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: approximately 9.5 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest; Latest stable: gemini-1.5-flash; Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
- Latest update: September 2024

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks.
Try in Google AI Studio

Model details:
- Model code: models/gemini-1.5-flash-8b
- Supported data types: Inputs: audio, images, video, and text; Output: text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: approximately 9.5 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest; Latest stable: gemini-1.5-flash-8b; Stable: gemini-1.5-flash-8b-001
- Latest update: October 2024

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.

Try in Google AI Studio

Model details:
- Model code: models/gemini-1.5-pro
- Supported data types: Inputs: audio, images, video, and text; Output: text
- Token limits [*]: Input: 2,097,152; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 7,200; Maximum video length: 2 hours; Maximum audio length: approximately 19 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest; Latest stable: gemini-1.5-pro; Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
- Latest update: September 2024

Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models.

Model details:
- Model codes (Gemini API): imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
- Supported data types: Input: text; Output: images
- Token limits [*]: Input: 480 tokens (text); Output images: 1 (Ultra), 1 to 4 (Standard)
- Latest update: June 2025

Imagen 3

Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details:
- Model code (Gemini API): imagen-3.0-generate-002
- Supported data types: Input: text; Output: images
- Token limits [*]: Input: N/A; Output images: up to 4
- Latest update: February 2025

Veo 2

Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts.
Model details:
- Model code (Gemini API): veo-2.0-generate-001
- Supported data types: Input: text, image; Output: video
- Limits: Text input: N/A; Image input: any image resolution and aspect ratio up to 20 MB file size; Output video: up to 2
- Latest update: April 2025

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Try in Google AI Studio

Model details:
- Model code: models/gemini-live-2.5-flash-preview
- Supported data types: Inputs: audio, video, and text; Output: text and audio
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview
- Latest update: June 2025
- Knowledge cutoff: January 2025

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Try in Google AI Studio

Model details:
- Model code: models/gemini-2.0-flash-live-001
- Supported data types: Inputs: audio, video, and text; Output: text and audio
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001
- Latest update: April 2025
- Knowledge cutoff: August 2024

Gemini Embedding Experimental

Gemini embedding achieves SOTA performance across many key dimensions, including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details:
- Model code (Gemini API): gemini-embedding-exp-03-07
- Supported data types: Input: text; Output: text embeddings
- Token limits [*]: Input: 8,192; Output dimension size: elastic, supports 3072, 1536, or 768
- Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details:
- Model code (Gemini API): models/text-embedding-004
- Supported data types: Input: text; Output: text embeddings
- Token limits [*]: Input: 2,048; Output dimension size: 768
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Not supported
- Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details:
- Model code: models/embedding-001
- Supported data types: Input: text; Output: text embeddings
- Token limits [*]: Input: 2,048; Output dimension size: 768
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Not supported
- Latest update: December 2023
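As a minimal illustration of calling one of the embedding models above with the Python SDK (treat the response field names as assumptions of this sketch; they may differ across SDK versions):

from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)
# One embedding per input; for this model each embedding has 768 values.
print(len(result.embeddings[0].values))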
AQA

You can use the AQA model to perform Attributed Question-Answering (AQA) related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability.

Model details:
- Model code: models/aqa
- Supported data types: Input: text; Output: text
- Supported language: English
- Token limits [*]: Input: 7,168; Output: 1,024
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Supported
- Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

- Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.
- Stable: Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.
- Preview: Points to a preview model, which may not be suitable for production use and may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.
- Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.
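To make these patterns concrete, here is a small sketch that passes each naming form as the model argument; the model ids are taken from the examples above, and otherwise the call is identical to the quickstart:

from google import genai

client = genai.Client()

for model_id in [
    "gemini-2.0-flash",              # latest stable
    "gemini-2.0-flash-001",          # pinned stable (recommended for production)
    "gemini-2.5-pro-preview-06-05",  # preview
]:
    response = client.models.generate_content(
        model=model_id,
        contents="Say hello in one word.",
    )
    print(model_id, "->", response.text)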
Experimental models

In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits.

We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released below, along with their replacement versions (model code | base model | replacement version):

- gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
- gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
- gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
- gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
- gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
- gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
- gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
- gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
- gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
- gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
- gemini-exp-1121 | Gemini | gemini-exp-1206
- gemini-exp-1114 | Gemini | gemini-exp-1206
- gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
- gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
- gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
- gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).

Last updated 2025-07-07 UTC.
\ No newline at end of file diff --git a/text_content/models_experimental-models_290d5c68.txt b/text_content/models_experimental-models_290d5c68.txt new file mode 100644 index 0000000000000000000000000000000000000000..caecf460968f6dcf5920724bf32ac49f9222496a --- /dev/null +++ b/text_content/models_experimental-models_290d5c68.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-1.5-flash-8b +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Gemini Embedding Experimental

Gemini Embedding achieves state-of-the-art (SOTA) performance across many key dimensions, including code, multilingual, and retrieval tasks. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details:
Model code (Gemini API): gemini-embedding-exp-03-07
Supported data types: Input: Text. Output: Text embeddings
Token limits [*]: Input token limit: 8,192. Output dimension size: Elastic; supports 3072, 1536, or 768
Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance.

Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details:
Model code (Gemini API): models/text-embedding-004
Supported data types: Input: Text. Output: Text embeddings
Token limits [*]: Input token limit: 2,048. Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details:
Model code: models/embedding-001
Supported data types: Input: Text. Output: Text embeddings
Token limits [*]: Input token limit: 2,048. Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Not supported
Latest update: December 2023
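Since the embedding models above share the same calling convention, here is a minimal sketch of generating embeddings with the google-genai Python SDK; the sample strings are illustrative assumptions.

from google import genai

client = genai.Client()

# Embed two strings in one call; relatedness can then be measured by
# comparing the returned vectors (e.g., with a dot product).
result = client.models.embed_content(
    model="text-embedding-004",
    contents=["What is the meaning of life?", "How do I bake bread?"],
)

for embedding in result.embeddings:
    # text-embedding-004 returns 768-dimensional vectors; the elastic
    # gemini-embedding-exp-03-07 model also accepts
    # config=types.EmbedContentConfig(output_dimensionality=...) to choose
    # 3072, 1536, or 768 dimensions.
    print(len(embedding.values))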
AQA

You can use the AQA model to perform Attributed Question-Answering (AQA)-related tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question can be answered from those sources.

Model details:
Model code: models/aqa
Supported data types: Input: Text. Output: Text
Supported language: English
Token limits [*]: Input token limit: 7,168. Output token limit: 1,024
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: Supported
Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models we released are listed below, each with its base model and the version that replaced it:

gemini-2.5-flash-preview-04-17 (Gemini 2.5 Flash) -> gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation (Gemini 2.0 Flash) -> gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 (Gemini 2.5 Pro) -> gemini-2.5-pro
gemini-2.5-pro-preview-05-06 (Gemini 2.5 Pro) -> gemini-2.5-pro
gemini-2.5-pro-preview-03-25 (Gemini 2.5 Pro) -> gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 (Gemini 2.5 Flash) -> gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 (Gemini 2.0 Pro Experimental) -> gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp (Gemini 2.0 Flash) -> gemini-2.0-flash
gemini-exp-1206 (Gemini 2.0 Pro) -> gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 (Gemini 2.0 Flash Thinking) -> gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 (Gemini) -> gemini-exp-1206
gemini-exp-1114 (Gemini) -> gemini-exp-1206
gemini-1.5-pro-exp-0827 (Gemini 1.5 Pro) -> gemini-exp-1206
gemini-1.5-pro-exp-0801 (Gemini 1.5 Pro) -> gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 (Gemini 1.5 Flash-8B) -> gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 (Gemini 1.5 Flash-8B) -> gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).

Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
\ No newline at end of file diff --git a/text_content/models_experimental-models_453536b9.txt b/text_content/models_experimental-models_453536b9.txt new file mode 100644 index 0000000000000000000000000000000000000000..15d09018db2b544cddcc7f415716978eb9551c10 --- /dev/null +++ b/text_content/models_experimental-models_453536b9.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#gemini-2.5-flash-preview-tts +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . 
For example, gemini-2.5-pro-preview-06-05 . Experimental Points to an experimental model which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: --- . For example, gemini-2.0-pro-exp-02-05 . Experimental models In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future. Previous experimental models As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released in the following section along with the replacement version: Model code Base model Replacement version gemini-2.5-flash-preview-04-17 Gemini 2.5 Flash gemini-2.5-flash-preview-05-20 gemini-2.0-flash-exp-image-generation Gemini 2.0 Flash gemini-2.0-flash-preview-image-generation gemini-2.5-pro-preview-06-05 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-05-06 Gemini 2.5 Pro gemini-2.5-pro gemini-2.5-pro-preview-03-25 Gemini 2.5 Pro gemini-2.5-pro gemini-2.0-flash-thinking-exp-01-21 Gemini 2.5 Flash gemini-2.5-flash-preview-04-17 gemini-2.0-pro-exp-02-05 Gemini 2.0 Pro Experimental gemini-2.5-pro-preview-03-25 gemini-2.0-flash-exp Gemini 2.0 Flash gemini-2.0-flash gemini-exp-1206 Gemini 2.0 Pro gemini-2.0-pro-exp-02-05 gemini-2.0-flash-thinking-exp-1219 Gemini 2.0 Flash Thinking gemini-2.0-flash-thinking-exp-01-21 gemini-exp-1121 Gemini gemini-exp-1206 gemini-exp-1114 Gemini gemini-exp-1206 gemini-1.5-pro-exp-0827 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-pro-exp-0801 Gemini 1.5 Pro gemini-exp-1206 gemini-1.5-flash-8b-exp-0924 Gemini 1.5 Flash-8B gemini-1.5-flash-8b gemini-1.5-flash-8b-exp-0827 Gemini 1.5 Flash-8B gemini-1.5-flash-8b Supported languages Gemini models are trained to work with the following languages: Arabic ( ar ) Bengali ( bn ) Bulgarian ( bg ) Chinese simplified and traditional ( zh ) Croatian ( hr ) Czech ( cs ) Danish ( da ) Dutch ( nl ) English ( en ) Estonian ( et ) Finnish ( fi ) French ( fr ) German ( de ) Greek ( el ) Hebrew ( iw ) Hindi ( hi ) Hungarian ( hu ) Indonesian ( id ) Italian ( it ) Japanese ( ja ) Korean ( ko ) Latvian ( lv ) Lithuanian ( lt ) Norwegian ( no ) Polish ( pl ) Portuguese ( pt ) Romanian ( ro ) Russian ( ru ) Serbian ( sr ) Slovak ( sk ) Slovenian ( sl ) Spanish ( es ) Swahili ( sw ) Swedish ( sv ) Thai ( th ) Turkish ( tr ) Ukrainian ( uk ) Vietnamese ( vi ) Send feedback Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License , and code samples are licensed under the Apache 2.0 License . For details, see the Google Developers Site Policies . Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC. 
\ No newline at end of file diff --git a/text_content/models_experimental-models_83147de1.txt b/text_content/models_experimental-models_83147de1.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad0214c81a0e686a293f02d2700195da4e290eb2 --- /dev/null +++ b/text_content/models_experimental-models_83147de1.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/experimental-models#imagen-4 +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Latest: gemini-2.0-flash-lite Stable: gemini-2.0-flash-lite-001 calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 1.5 Flash Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-latest Latest stable: gemini-1.5-flash Stable: gemini-1.5-flash-001 gemini-1.5-flash-002 calendar_month Latest update September 2024 Gemini 1.5 Flash-8B Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-flash-8b save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 3,600 Maximum video length 1 hour Maximum audio length Approximately 9.5 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest Latest stable: gemini-1.5-flash-8b Stable: gemini-1.5-flash-8b-001 calendar_month Latest update October 2024 Gemini 1.5 Pro Try Gemini 2.5 Pro Preview , our most advanced Gemini model to date. Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide-range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-1.5-pro save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 2,097,152 Output token limit 8,192 movie_info Audio/visual specs Maximum number of images per prompt 7,200 Maximum video length 2 hours Maximum audio length Approximately 19 hours handyman Capabilities System instructions Supported JSON mode Supported JSON schema Supported Adjustable safety settings Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Live API Not supported 123 Versions Read the model version patterns for more details. Latest: gemini-1.5-pro-latest Latest stable: gemini-1.5-pro Stable: gemini-1.5-pro-001 gemini-1.5-pro-002 calendar_month Latest update September 2024 Imagen 4 Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models. 
Model details Property Description id_card Model code Gemini API imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit 480 tokens (text) Output images 1 (Ultra) 1 to 4 (Standard) calendar_month Latest update June 2025 Imagen 3 Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting and fewer distracting artifacts than our previous models. Model details Property Description id_card Model code Gemini API imagen-3.0-generate-002 save Supported data types Input Text Output Images token_auto Token limits [*] Input token limit N/A Output images Up to 4 calendar_month Latest update February 2025 Veo 2 Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts. Model details Property Description id_card Model code Gemini API veo-2.0-generate-001 save Supported data types Input Text, image Output Video token_auto Limits Text input N/A Image input Any image resolution and aspect ratio up to 20MB file size Output video Up to 2 calendar_month Latest update April 2025 Gemini 2.5 Flash Live The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-live-2.5-flash-preview save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.0 Flash Live The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-live-001 save Supported data types Inputs Audio, video, and text Output Text, and audio token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Supported Thinking Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001 calendar_month Latest update April 2025 cognition_2 Knowledge cutoff August 2024 Gemini Embedding Experimental Gemini embedding achieves a SOTA performance across many key dimensions including code, multi-lingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model. 
Model details Property Description id_card Model code Gemini API gemini-embedding-exp-03-07 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 8,192 Output dimension size Elastic, supports: 3072, 1536, or 768 calendar_month Latest update March 2025 Text Embedding and Embedding Text Embedding Try our new experimental Gemini embedding model which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves a stronger retrieval performance and outperforms existing models with comparable dimensions, on the standard MTEB embedding benchmarks. Model details Property Description id_card Model code Gemini API models/text-embedding-004 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update April 2024 Embedding Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding. You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens. Embedding model details Property Description id_card Model code models/embedding-001 save Supported data types Input Text Output Text embeddings token_auto Token limits [*] Input token limit 2,048 Output dimension size 768 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Not supported calendar_month Latest update December 2023 AQA You can use the AQA model to perform Attributed Question-Answering (AQA)–related tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with estimating answerable probability. Model details Property Description id_card Model code models/aqa save Supported data types Input Text Output Text language Supported language English token_auto Token limits [*] Input token limit 7,168 Output token limit 1,024 swap_driving_apps_wheel Rate limits [**] 1,500 requests per minute encrypted Adjustable safety settings Supported calendar_month Latest update December 2023 See the examples to explore the capabilities of these model variations. [*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words. Model version name patterns Gemini models are available in either stable , preview , or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. Latest stable Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: -- . For example, gemini-2.0-flash . Stable Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: --- . For example, gemini-2.0-flash-001 . Preview Points to a preview model which may not be suitable for production use, come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: --- . 
Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use. A short sketch showing these name formats in use appears after the supported languages list at the end of this file.

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the pattern <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use and may come with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models are listed below with their base model and replacement version:

- gemini-2.5-flash-preview-04-17 (Gemini 2.5 Flash): replaced by gemini-2.5-flash-preview-05-20
- gemini-2.0-flash-exp-image-generation (Gemini 2.0 Flash): replaced by gemini-2.0-flash-preview-image-generation
- gemini-2.5-pro-preview-06-05 (Gemini 2.5 Pro): replaced by gemini-2.5-pro
- gemini-2.5-pro-preview-05-06 (Gemini 2.5 Pro): replaced by gemini-2.5-pro
- gemini-2.5-pro-preview-03-25 (Gemini 2.5 Pro): replaced by gemini-2.5-pro
- gemini-2.0-flash-thinking-exp-01-21 (Gemini 2.5 Flash): replaced by gemini-2.5-flash-preview-04-17
- gemini-2.0-pro-exp-02-05 (Gemini 2.0 Pro Experimental): replaced by gemini-2.5-pro-preview-03-25
- gemini-2.0-flash-exp (Gemini 2.0 Flash): replaced by gemini-2.0-flash
- gemini-exp-1206 (Gemini 2.0 Pro): replaced by gemini-2.0-pro-exp-02-05
- gemini-2.0-flash-thinking-exp-1219 (Gemini 2.0 Flash Thinking): replaced by gemini-2.0-flash-thinking-exp-01-21
- gemini-exp-1121 (Gemini): replaced by gemini-exp-1206
- gemini-exp-1114 (Gemini): replaced by gemini-exp-1206
- gemini-1.5-pro-exp-0827 (Gemini 1.5 Pro): replaced by gemini-exp-1206
- gemini-1.5-pro-exp-0801 (Gemini 1.5 Pro): replaced by gemini-exp-1206
- gemini-1.5-flash-8b-exp-0924 (Gemini 1.5 Flash-8B): replaced by gemini-1.5-flash-8b
- gemini-1.5-flash-8b-exp-0827 (Gemini 1.5 Flash-8B): replaced by gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).
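Tying the version-name patterns above back to code: the model string passed to generateContent is all that selects a variant. A minimal sketch, assuming the google-genai Python SDK; the prompt is a placeholder, and the two names come from the pattern examples above:

from google import genai

client = genai.Client()

# Pinned stable snapshot: <model>-<generation>-<variation>-<version>.
# The model behind this name should not change under you.
pinned = "gemini-2.0-flash-001"

# Latest-stable alias: <model>-<generation>-<variation>.
# May silently advance to a newer stable snapshot.
latest = "gemini-2.0-flash"

for model in (pinned, latest):
    response = client.models.generate_content(
        model=model,
        contents="In one sentence: why pin a model version in production?",
    )
    print(f"{model}: {response.text}")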
\ No newline at end of file
\ No newline at end of file
diff --git a/text_content/models_gemini_059a81c3.txt b/text_content/models_gemini_059a81c3.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5b4d9bcd33e5d310c90f0ff9f0a665ad93de0411
--- /dev/null
+++ b/text_content/models_gemini_059a81c3.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#rate-limits
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
\ No newline at end of file
diff --git a/text_content/models_gemini_5b88f1fc.txt b/text_content/models_gemini_5b88f1fc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bf6012631b5fb78b6b831b48f6a5c9cf32a20e3c
--- /dev/null
+++ b/text_content/models_gemini_5b88f1fc.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#live-api
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
\ No newline at end of file
diff --git a/text_content/models_gemini_865af3cf.txt b/text_content/models_gemini_865af3cf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36b4b20b6c2ce2ebbdfbd00182bc170c814d7d99
--- /dev/null
+++ b/text_content/models_gemini_865af3cf.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#model-versions
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
Model variants
The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of the Gemini variants that are available:

Model variant | Input(s) | Output | Optimized for
Gemini 2.5 Pro (gemini-2.5-pro) | Audio, images, videos, text, and PDF | Text | Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more
Gemini 2.5 Flash (gemini-2.5-flash) | Audio, images, videos, and text | Text | Adaptive thinking, cost efficiency
Gemini 2.5 Flash-Lite Preview (gemini-2.5-flash-lite-preview-06-17) | Text, image, video, audio | Text | Most cost-efficient model supporting high throughput
Gemini 2.5 Flash Native Audio (gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog) | Audio, videos, and text | Text and audio, interleaved | High quality, natural conversational audio outputs, with or without thinking
Gemini 2.5 Flash Preview TTS (gemini-2.5-flash-preview-tts) | Text | Audio | Low latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.5 Pro Preview TTS (gemini-2.5-pro-preview-tts) | Text | Audio | Low latency, controllable, single- and multi-speaker text-to-speech audio generation
Gemini 2.0 Flash (gemini-2.0-flash) | Audio, images, videos, and text | Text | Next-generation features, speed, and realtime streaming
Gemini 2.0 Flash Preview Image Generation (gemini-2.0-flash-preview-image-generation) | Audio, images, videos, and text | Text, images | Conversational image generation and editing
Gemini 2.0 Flash-Lite (gemini-2.0-flash-lite) | Audio, images, videos, and text | Text | Cost efficiency and low latency
Gemini 1.5 Flash (gemini-1.5-flash) | Audio, images, videos, and text | Text | Fast and versatile performance across a diverse variety of tasks
Gemini 1.5 Flash-8B (gemini-1.5-flash-8b) | Audio, images, videos, and text | Text | High volume and lower intelligence tasks
Gemini 1.5 Pro (gemini-1.5-pro) | Audio, images, videos, and text | Text | Complex reasoning tasks requiring more intelligence
Gemini Embedding (gemini-embedding-exp) | Text | Text embeddings | Measuring the relatedness of text strings
Imagen 4 (imagen-4.0-generate-preview-06-06 & imagen-4.0-ultra-generate-preview-06-06) | Text | Images | Our most up-to-date image generation model
Imagen 3 (imagen-3.0-generate-002) | Text | Images | High quality image generation model
Veo 2 (veo-2.0-generate-001) | Text, images | Video | High quality video generation
Gemini 2.5 Flash Live (gemini-live-2.5-flash-preview) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions
Gemini 2.0 Flash Live (gemini-2.0-flash-live-001) | Audio, video, and text | Text, audio | Low-latency bidirectional voice and video interactions

You can view the rate limits for each model on the rate limits page.

Gemini 2.5 Pro
Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context.
Try in Google AI Studio

Model details
Property | Description
Model code | gemini-2.5-pro
Supported data types | Inputs: audio, images, video, text, and PDF; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 65,536
Capabilities | Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search grounding: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Thinking: Supported; Batch API: Supported
Versions | Read the model version patterns for more details. Stable: gemini-2.5-pro; Preview: gemini-2.5-pro-preview-06-05; Preview: gemini-2.5-pro-preview-05-06; Preview: gemini-2.5-pro-preview-03-25
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash
Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large-scale processing, low-latency, high-volume tasks that require thinking, and agentic use cases.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.5-flash
Supported data types | Inputs: text, images, video, audio; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 65,536
Capabilities | Audio generation: Not supported; Caching: Supported; Code execution: Supported; Function calling: Supported; Image generation: Not supported; Search grounding: Supported; Structured outputs: Supported; Thinking: Supported; Tuning: Not supported; Batch API: Supported
Versions | Read the model version patterns for more details. Stable: gemini-2.5-flash; Preview: gemini-2.5-flash-preview-05-20
Latest update | June 2025
Knowledge cutoff | January 2025
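Both 2.5 model cards above list structured outputs as supported. A minimal sketch of requesting schema-conforming JSON with the google-genai Python SDK; the Recipe schema is an invented example:

from google import genai
from google.genai import types
from pydantic import BaseModel

class Recipe(BaseModel):
    # Invented example schema; shape it to your own data.
    name: str
    ingredients: list[str]

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="List two simple cookie recipes.",
    config=types.GenerateContentConfig(
        response_mime_type="application/json",
        response_schema=list[Recipe],
    ),
)
print(response.text)  # JSON matching the schema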
Gemini 2.5 Flash-Lite Preview
A Gemini 2.5 Flash model optimized for cost efficiency and low latency.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.5-flash-lite-preview-06-17
Supported data types | Inputs: text, images, video, and audio; Output: text
Token limits [*] | Input token limit: 1,000,000; Output token limit: 64,000
Capabilities | Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; URL context: Supported; Search grounding: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Thinking: Supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash Native Audio
Our native audio dialog models, with and without thinking, available through the Live API. These models provide interactive and unstructured conversational experiences, with style and control prompting.
Try native audio in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog
Supported data types | Inputs: audio, video, text; Output: audio and text
Token limits [*] | Input token limit: 128,000; Output token limit: 8,000
Capabilities | Audio generation: Supported; Caching: Not supported; Code execution: Not supported; Function calling: Supported; Image generation: Not supported; Search grounding: Supported; Structured outputs: Not supported; Thinking: Supported; Tuning: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-native-audio-dialog; Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog
Latest update | May 2025
Knowledge cutoff | January 2025

Gemini 2.5 Flash Preview Text-to-Speech
Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash Preview TTS rate limits are more restricted since it is an experimental / preview model.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.5-flash-preview-tts
Supported data types | Inputs: text; Output: audio
Token limits [*] | Input token limit: 8,000; Output token limit: 16,000
Capabilities | Structured outputs: Not supported; Caching: Not supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Audio generation: Supported; Live API: Not supported; Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-tts
Latest update | May 2025
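A minimal single-speaker TTS sketch with the google-genai Python SDK, assuming the SpeechConfig/VoiceConfig types and the prebuilt voice name "Kore" from the speech generation docs; the output is raw PCM bytes that you would wrap in a WAV container yourself:

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)
# Raw 24 kHz, 16-bit PCM bytes; write to a .wav with the wave module if needed.
pcm = response.candidates[0].content.parts[0].inline_data.data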
Gemini 2.5 Pro Preview Text-to-Speech
Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro Preview TTS rate limits are more restricted since it is an experimental / preview model.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.5-pro-preview-tts
Supported data types | Inputs: text; Output: audio
Token limits [*] | Input token limit: 8,000; Output token limit: 16,000
Capabilities | Structured outputs: Not supported; Caching: Not supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Audio generation: Supported; Live API: Not supported; Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.5-pro-preview-tts
Latest update | May 2025

Gemini 2.0 Flash
Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.0-flash
Supported data types | Inputs: audio, images, video, and text; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Capabilities | Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Not supported; Live API: Supported; Thinking: Experimental; Batch API: Supported
Versions | Read the model version patterns for more details. Latest: gemini-2.0-flash; Stable: gemini-2.0-flash-001; Experimental: gemini-2.0-flash-exp
Latest update | February 2025
Knowledge cutoff | August 2024

Gemini 2.0 Flash Preview Image Generation
Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.0-flash-preview-image-generation
Supported data types | Inputs: audio, images, video, and text; Output: text and images
Token limits [*] | Input token limit: 32,000; Output token limit: 8,192
Capabilities | Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Not supported; Code execution: Not supported; Search: Not supported; Image generation: Supported; Audio generation: Not supported; Live API: Not supported; Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation
Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update | May 2025
Knowledge cutoff | August 2024
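A minimal conversational image generation sketch with the google-genai Python SDK; the prompt and output filename are arbitrary examples, and the response interleaves text and image parts:

from google import genai
from google.genai import types

client = genai.Client()
response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate a picture of a paper boat on a puddle.",
    # This model returns text and images interleaved, so both
    # modalities must be requested.
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)
for part in response.candidates[0].content.parts:
    if part.text:
        print(part.text)
    elif part.inline_data:
        with open("boat.png", "wb") as f:
            f.write(part.inline_data.data)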
Gemini 2.0 Flash-Lite
A Gemini 2.0 Flash model optimized for cost efficiency and low latency.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.0-flash-lite
Supported data types | Inputs: audio, images, video, and text; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Capabilities | Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Not supported; Search: Not supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Batch API: Supported
Versions | Read the model version patterns for more details. Latest: gemini-2.0-flash-lite; Stable: gemini-2.0-flash-lite-001
Latest update | February 2025
Knowledge cutoff | August 2024

Gemini 1.5 Flash
Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-1.5-flash
Supported data types | Inputs: audio, images, video, and text; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: approximately 9.5 hours
Capabilities | System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-flash-latest; Latest stable: gemini-1.5-flash; Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
Latest update | September 2024

Gemini 1.5 Flash-8B
Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-1.5-flash-8b
Supported data types | Inputs: audio, images, video, and text; Output: text
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: approximately 9.5 hours
Capabilities | System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-flash-8b-latest; Latest stable: gemini-1.5-flash-8b; Stable: gemini-1.5-flash-8b-001
Latest update | October 2024

Gemini 1.5 Pro
Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.
Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-1.5-pro
Supported data types | Inputs: audio, images, video, and text; Output: text
Token limits [*] | Input token limit: 2,097,152; Output token limit: 8,192
Audio/visual specs | Maximum number of images per prompt: 7,200; Maximum video length: 2 hours; Maximum audio length: approximately 19 hours
Capabilities | System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
Versions | Read the model version patterns for more details. Latest: gemini-1.5-pro-latest; Latest stable: gemini-1.5-pro; Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
Latest update | September 2024
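Several of the model cards above list caching as supported. A sketch of explicit context caching with the google-genai Python SDK, assuming the caches.create API from the caching docs; the model version, placeholder contents, and TTL are example values, and cached prefixes must meet the model's minimum token count:

from google import genai
from google.genai import types

client = genai.Client()
# Cache a large, reused prefix once, then reference it per request.
cache = client.caches.create(
    model="gemini-1.5-flash-001",  # caching requires an explicit version
    config=types.CreateCachedContentConfig(
        system_instruction="You are an expert on this codebase.",
        contents=["<large reused context goes here>"],
        ttl="3600s",  # example time-to-live
    ),
)
response = client.models.generate_content(
    model="gemini-1.5-flash-001",
    contents="Where is the request router defined?",
    config=types.GenerateContentConfig(cached_content=cache.name),
)
print(response.text)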
Imagen 4
Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models.

Model details
Property | Description
Model code (Gemini API) | imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
Supported data types | Input: text; Output: images
Token limits [*] | Input token limit: 480 tokens (text)
Output images | 1 (Ultra); 1 to 4 (Standard)
Latest update | June 2025

Imagen 3
Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details
Property | Description
Model code (Gemini API) | imagen-3.0-generate-002
Supported data types | Input: text; Output: images
Token limits [*] | Input token limit: N/A
Output images | Up to 4
Latest update | February 2025

Veo 2
Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos and capturing the artistic nuance in your prompts.

Model details
Property | Description
Model code (Gemini API) | veo-2.0-generate-001
Supported data types | Input: text, image; Output: video
Limits | Text input: N/A; Image input: any image resolution and aspect ratio up to 20 MB file size; Output video: up to 2
Latest update | April 2025

Gemini 2.5 Flash Live
The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-live-2.5-flash-preview
Supported data types | Inputs: audio, video, and text; Output: text and audio
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Capabilities | Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-live-2.5-flash-preview
Latest update | June 2025
Knowledge cutoff | January 2025

Gemini 2.0 Flash Live
The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.
Try in Google AI Studio

Model details
Property | Description
Model code | models/gemini-2.0-flash-live-001
Supported data types | Inputs: audio, video, and text; Output: text and audio
Token limits [*] | Input token limit: 1,048,576; Output token limit: 8,192
Capabilities | Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
Versions | Read the model version patterns for more details. Preview: gemini-2.0-flash-live-001
Latest update | April 2025
Knowledge cutoff | August 2024
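A minimal text-in/text-out Live API session sketch with the google-genai Python SDK, assuming the async live.connect interface and send_client_content/receive methods from the Live API docs; the prompt is an arbitrary example:

import asyncio
from google import genai
from google.genai import types

client = genai.Client()

async def main():
    # Open a bidirectional session; TEXT keeps the example simple,
    # but these Live models also support AUDIO responses.
    config = types.LiveConnectConfig(response_modalities=["TEXT"])
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello!")])
        )
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())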
Gemini Embedding Experimental
Gemini embedding achieves SOTA performance across many key dimensions including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details
Property | Description
Model code (Gemini API) | gemini-embedding-exp-03-07
Supported data types | Input: text; Output: text embeddings
Token limits [*] | Input token limit: 8,192
Output dimension size | Elastic; supports 3072, 1536, or 768
Latest update | March 2025

Text Embedding and Embedding

Text Embedding
Try our new experimental Gemini embedding model, which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details
Property | Description
Model code (Gemini API) | models/text-embedding-004
Supported data types | Input: text; Output: text embeddings
Token limits [*] | Input token limit: 2,048
Output dimension size | 768
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Not supported
Latest update | April 2024

Embedding
Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.
You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details
Property | Description
Model code | models/embedding-001
Supported data types | Input: text; Output: text embeddings
Token limits [*] | Input token limit: 2,048
Output dimension size | 768
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Not supported
Latest update | December 2023

AQA
You can use the AQA model to perform Attributed Question-Answering (AQA)-related tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with an estimate of answerable probability.

Model details
Property | Description
Model code | models/aqa
Supported data types | Input: text; Output: text
Supported language | English
Token limits [*] | Input token limit: 7,168; Output token limit: 1,024
Rate limits [**] | 1,500 requests per minute
Adjustable safety settings | Supported
Latest update | December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.
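A minimal embedding call with the google-genai Python SDK against the embedding models above; the input string is an arbitrary example:

from google import genai

client = genai.Client()
result = client.models.embed_content(
    model="text-embedding-004",
    contents="What is the meaning of life?",
)
# One embedding per input; each is a list of 768 floats for this model.
print(len(result.embeddings[0].values))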
Model version name patterns
Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable
Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable
Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview
Points to a preview model which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental
Points to an experimental model which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.
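A minimal sketch of how these patterns look in code with the google-genai Python SDK: pin a specific stable version for production, or point at the floating latest-stable alias; the prompt is an arbitrary example:

from google import genai

client = genai.Client()

# Pinned stable version: behavior won't change under you.
pinned = client.models.generate_content(
    model="gemini-2.0-flash-001",
    contents="Give me one fun fact about tardigrades.",
)

# Floating latest stable: tracks the newest stable 2.0 Flash release.
floating = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Give me one fun fact about tardigrades.",
)
print(pinned.text, floating.text, sep="\n")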
\ No newline at end of file
diff --git a/text_content/models_gemini_9230d9f0.txt b/text_content/models_gemini_9230d9f0.txt
new file mode 100644
index 0000000000000000000000000000000000000000..356fd77cc3db97606c909934684d39c626dac3fb
--- /dev/null
+++ b/text_content/models_gemini_9230d9f0.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
\ No newline at end of file diff --git a/text_content/models_gemini_a04cc7ce.txt b/text_content/models_gemini_a04cc7ce.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8e2f5406ddb46234abae5318b52375b0fe06cbd --- /dev/null +++ b/text_content/models_gemini_a04cc7ce.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#live-api-2.0 +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Gemini 2.5 Flash-Lite Preview

A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Try it in Google AI Studio.

Model code: models/gemini-2.5-flash-lite-preview-06-17
Supported data types: inputs text, images, video, and audio; output text
Token limits [*]: input 1,000,000; output 64,000
Capabilities: structured outputs, caching, function calling, code execution, URL context, search grounding, and thinking are supported; tuning, image generation, audio generation, and the Live API are not supported
Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-lite-preview-06-17
Latest update: June 2025. Knowledge cutoff: January 2025.

Gemini 2.5 Flash Native Audio

Our native audio dialog models, with and without thinking, available through the Live API. These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio.

Model codes: models/gemini-2.5-flash-preview-native-audio-dialog and models/gemini-2.5-flash-exp-native-audio-thinking-dialog
Supported data types: inputs audio, video, and text; output audio and text
Token limits [*]: input 128,000; output 8,000
Capabilities: audio generation, function calling, search grounding, and thinking are supported; caching, code execution, image generation, structured outputs, and tuning are not supported
Versions (read the model version patterns for more details): Preview: gemini-2.5-flash-preview-native-audio-dialog; Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog
Latest update: May 2025. Knowledge cutoff: January 2025.

Gemini 2.5 Flash Preview Text-to-Speech

Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restricted since it is an experimental / preview model. Try it in Google AI Studio.

Model code: models/gemini-2.5-flash-preview-tts
Supported data types: input text; output audio
Token limits [*]: input 8,000; output 16,000
Capabilities: audio generation is supported; structured outputs, caching, tuning, function calling, code execution, search, the Live API, and thinking are not supported
Versions (read the model version patterns for more details): gemini-2.5-flash-preview-tts
Latest update: May 2025.

Gemini 2.5 Pro Preview Text-to-Speech

Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restricted since it is an experimental / preview model. Try it in Google AI Studio.

Model code: models/gemini-2.5-pro-preview-tts
Supported data types: input text; output audio
Token limits [*]: input 8,000; output 16,000
Capabilities: audio generation is supported; structured outputs, caching, tuning, function calling, code execution, search, the Live API, and thinking are not supported
Versions (read the model version patterns for more details): gemini-2.5-pro-preview-tts
Latest update: May 2025.
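Both TTS previews are driven through generateContent by requesting an audio response modality. A minimal sketch with the google-genai Python SDK, assuming the SpeechConfig and PrebuiltVoiceConfig types it exposes; the voice name and prompt are illustrative, and the returned bytes are raw PCM that still need a WAV header before playback:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: have a wonderful day!",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
            )
        ),
    ),
)

# The synthesized speech arrives as inline PCM bytes on the first part.
pcm_bytes = response.candidates[0].content.parts[0].inline_data.data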
Gemini 2.0 Flash

Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M-token context window. Try it in Google AI Studio.

Model code: models/gemini-2.0-flash
Supported data types: inputs audio, images, video, and text; output text
Token limits [*]: input 1,048,576; output 8,192
Capabilities: structured outputs, caching, function calling, code execution, search, the Live API, and the Batch API are supported; thinking is experimental; tuning, image generation, and audio generation are not supported
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash; Stable: gemini-2.0-flash-001; Experimental: gemini-2.0-flash-exp
Latest update: February 2025. Knowledge cutoff: August 2024.

Gemini 2.0 Flash Preview Image Generation

Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try it in Google AI Studio.

Model code: models/gemini-2.0-flash-preview-image-generation
Supported data types: inputs audio, images, video, and text; output text and images
Token limits [*]: input 32,000; output 8,192
Capabilities: structured outputs, caching, and image generation are supported; tuning, function calling, code execution, search, audio generation, the Live API, and thinking are not supported
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-preview-image-generation
Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update: May 2025. Knowledge cutoff: August 2024.
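The image-generation preview returns interleaved text and image parts when both response modalities are requested. A minimal sketch with the google-genai Python SDK; the prompt and output filename are illustrative assumptions:

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Generate a watercolor sketch of a lighthouse at dusk.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# Walk the parts: text commentary and inline image bytes are interleaved.
for part in response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        with open("lighthouse.png", "wb") as f:  # illustrative filename
            f.write(part.inline_data.data)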
Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try it in Google AI Studio.

Model code: models/gemini-2.0-flash-lite
Supported data types: inputs audio, images, video, and text; output text
Token limits [*]: input 1,048,576; output 8,192
Capabilities: structured outputs, caching, function calling, and the Batch API are supported; tuning, code execution, search, image generation, audio generation, and the Live API are not supported
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite; Stable: gemini-2.0-flash-lite-001
Latest update: February 2025. Knowledge cutoff: August 2024.

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks. Try it in Google AI Studio.

Model code: models/gemini-1.5-flash
Supported data types: inputs audio, images, video, and text; output text
Token limits [*]: input 1,048,576; output 8,192
Audio/visual specs: maximum of 3,600 images per prompt; maximum video length of 1 hour; maximum audio length of approximately 9.5 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not supported
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest; Latest stable: gemini-1.5-flash; Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
Latest update: September 2024.

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks. Try it in Google AI Studio.

Model code: models/gemini-1.5-flash-8b
Supported data types: inputs audio, images, video, and text; output text
Token limits [*]: input 1,048,576; output 8,192
Audio/visual specs: maximum of 3,600 images per prompt; maximum video length of 1 hour; maximum audio length of approximately 9.5 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not supported
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest; Latest stable: gemini-1.5-flash-8b; Stable: gemini-1.5-flash-8b-001
Latest update: October 2024.

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text. Try it in Google AI Studio.

Model code: models/gemini-1.5-pro
Supported data types: inputs audio, images, video, and text; output text
Token limits [*]: input 2,097,152; output 8,192
Audio/visual specs: maximum of 7,200 images per prompt; maximum video length of 2 hours; maximum audio length of approximately 19 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, function calling, and code execution are supported; tuning and the Live API are not supported
Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest; Latest stable: gemini-1.5-pro; Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
Latest update: September 2024.

Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher-resolution output than previous models.

Model codes (Gemini API): imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
Supported data types: input text; output images
Limits [*]: input 480 tokens (text); output images: 1 (Ultra), 1 to 4 (Standard)
Latest update: June 2025.
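Unlike the Gemini models above, the Imagen family is called through a dedicated image-generation method. A minimal sketch with the google-genai Python SDK, assuming its generate_images surface; the prompt, image count, and filenames are illustrative:

from google import genai
from google.genai import types

client = genai.Client()

result = client.models.generate_images(
    model="imagen-4.0-generate-preview-06-06",
    prompt="A close-up photo of dew on a spider web at sunrise",
    config=types.GenerateImagesConfig(number_of_images=2),
)

# Each generated image carries its encoded bytes directly.
for i, generated in enumerate(result.generated_images):
    with open(f"dew-{i}.png", "wb") as f:  # illustrative filenames
        f.write(generated.image.image_bytes)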
Imagen 3

Imagen 3 is our highest-quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model code (Gemini API): imagen-3.0-generate-002
Supported data types: input text; output images
Limits [*]: input token limit N/A; output images up to 4
Latest update: February 2025.

Veo 2

Veo 2 is our high-quality text- and image-to-video model, capable of generating detailed videos and capturing the artistic nuance in your prompts.

Model code (Gemini API): veo-2.0-generate-001
Supported data types: inputs text and image; output video
Limits: text input N/A; image input at any resolution and aspect ratio, up to a 20 MB file size; output videos up to 2
Latest update: April 2025.

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try it in Google AI Studio.

Model code: models/gemini-live-2.5-flash-preview
Supported data types: inputs audio, video, and text; output text and audio
Token limits [*]: input 1,048,576; output 8,192
Capabilities: structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not supported
Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview
Latest update: June 2025. Knowledge cutoff: January 2025.

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output. Try it in Google AI Studio.

Model code: models/gemini-2.0-flash-live-001
Supported data types: inputs audio, video, and text; output text and audio
Token limits [*]: input 1,048,576; output 8,192
Capabilities: structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not supported
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001
Latest update: April 2025. Knowledge cutoff: August 2024.
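A rough sketch of a text-only Live API session with the google-genai Python SDK. The session methods have shifted across SDK releases, so treat send_client_content and receive here as assumptions to verify against the current Live API guide:

import asyncio
from google import genai
from google.genai import types

client = genai.Client()

async def main():
    # Open a bidirectional session against a Live model.
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001",
        config={"response_modalities": ["TEXT"]},
    ) as session:
        await session.send_client_content(
            turns=types.Content(
                role="user", parts=[types.Part(text="Hello over the Live API")]
            ),
            turn_complete=True,
        )
        # Stream the model's reply as it is produced.
        async for message in session.receive():
            if message.text:
                print(message.text, end="")

asyncio.run(main())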
Gemini Embedding Experimental

Gemini embedding achieves SOTA performance across many key dimensions, including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model code (Gemini API): gemini-embedding-exp-03-07
Supported data types: input text; output text embeddings
Token limits [*]: input 8,192; output dimension size elastic, supporting 3072, 1536, or 768
Latest update: March 2025.

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance. Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model code (Gemini API): models/text-embedding-004
Supported data types: input text; output text embeddings
Token limits [*]: input 2,048; output dimension size 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: not supported
Latest update: April 2024.

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Model code: models/embedding-001
Supported data types: input text; output text embeddings
Token limits [*]: input 2,048; output dimension size 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: not supported
Latest update: December 2023.

AQA

You can use the AQA model to perform Attributed Question-Answering (AQA)-related tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in provided sources, along with an estimate of the probability that the question is answerable.

Model code: models/aqa
Supported data types: input text; output text
Supported language: English
Token limits [*]: input 7,168; output 1,024
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: supported
Latest update: December 2023.

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable: Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the pattern <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: Points to a specific stable model. Stable models usually don't change, and most production apps should use a specific stable model. To specify a stable version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: Points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the pattern <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.
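In practice, the difference between a floating alias and a pinned version is just the model string you pass to generateContent; a small illustration (the prompt is arbitrary):

from google import genai

client = genai.Client()

# Floating alias: silently advances when a new stable version ships.
latest = client.models.generate_content(
    model="gemini-2.0-flash", contents="ping"
)

# Pinned stable version: behavior stays fixed until you change the string.
pinned = client.models.generate_content(
    model="gemini-2.0-flash-001", contents="ping"
)
print(latest.text, pinned.text)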
Experimental models

In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models we released are listed below, along with their replacement versions:

Model code | Base model | Replacement version
gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).

Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
\ No newline at end of file
\ No newline at end of file diff --git a/text_content/models_gemini_d7e23d70.txt b/text_content/models_gemini_d7e23d70.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbc7ce8b7f0683d11d522283bccac91871f96f3a --- /dev/null +++ b/text_content/models_gemini_d7e23d70.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.0-flash-lite +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Gemini 2.5 Pro Preview Text-to-Speech

Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Its rate limits are more restrictive since it is an experimental / preview model.

Model details:
Model code: models/gemini-2.5-pro-preview-tts
Supported inputs: text
Output: audio
Token limits [*]: 8,000 input; 16,000 output
Capabilities: audio generation is supported; structured outputs, caching, tuning, function calling, code execution, search, the Live API, and thinking are not.
Versions (read the model version patterns for more details): gemini-2.5-pro-preview-tts
Latest update: May 2025

Gemini 2.0 Flash

Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window.

Model details:
Model code: models/gemini-2.0-flash
Supported inputs: audio, images, video, and text
Output: text
Token limits [*]: 1,048,576 input; 8,192 output
Capabilities: structured outputs, caching, function calling, code execution, search, the Live API, and the Batch API are supported; thinking is experimental; tuning, image generation, and audio generation are not supported.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash; Stable: gemini-2.0-flash-001; Experimental: gemini-2.0-flash-exp
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 2.0 Flash Preview Image Generation

Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally.

Model details:
Model code: models/gemini-2.0-flash-preview-image-generation
Supported inputs: audio, images, video, and text
Output: text and images
Token limits [*]: 32,000 input; 8,192 output
Capabilities: structured outputs, caching, and image generation are supported; tuning, function calling, code execution, search, audio generation, the Live API, and thinking are not.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-preview-image-generation
Note: gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, the Middle East, and Africa.
Latest update: May 2025
Knowledge cutoff: August 2024
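Because this model generates images through generateContent, conversational image generation is just a text prompt with an IMAGE response modality. A sketch with the google-genai Python SDK; the prompt and output filenames are illustrative.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.0-flash-preview-image-generation",
    contents="Create a picture of a lighthouse at dusk, then describe it.",
    config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"]),
)

# The reply interleaves text parts and inline image parts.
for i, part in enumerate(response.candidates[0].content.parts):
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None:
        with open(f"image_{i}.png", "wb") as f:
            f.write(part.inline_data.data)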
Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency.

Model details:
Model code: models/gemini-2.0-flash-lite
Supported inputs: audio, images, video, and text
Output: text
Token limits [*]: 1,048,576 input; 8,192 output
Capabilities: structured outputs, caching, function calling, and the Batch API are supported; tuning, code execution, search, image generation, audio generation, and the Live API are not.
Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite; Stable: gemini-2.0-flash-lite-001
Latest update: February 2025
Knowledge cutoff: August 2024

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.

Model details:
Model code: models/gemini-1.5-flash
Supported inputs: audio, images, video, and text
Output: text
Token limits [*]: 1,048,576 input; 8,192 output
Audio/visual specs: up to 3,600 images per prompt; video up to 1 hour; audio up to approximately 9.5 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest; Latest stable: gemini-1.5-flash; Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
Latest update: September 2024

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower intelligence tasks.

Model details:
Model code: models/gemini-1.5-flash-8b
Supported inputs: audio, images, video, and text
Output: text
Token limits [*]: 1,048,576 input; 8,192 output
Audio/visual specs: up to 3,600 images per prompt; video up to 1 hour; audio up to approximately 9.5 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, tuning, function calling, and code execution are supported; the Live API is not.
Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest; Latest stable: gemini-1.5-flash-8b; Stable: gemini-1.5-flash-8b-001
Latest update: October 2024

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.

Model details:
Model code: models/gemini-1.5-pro
Supported inputs: audio, images, video, and text
Output: text
Token limits [*]: 2,097,152 input; 8,192 output
Audio/visual specs: up to 7,200 images per prompt; video up to 2 hours; audio up to approximately 19 hours
Capabilities: system instructions, JSON mode, JSON schema, adjustable safety settings, caching, function calling, and code execution are supported; tuning and the Live API are not.
Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest; Latest stable: gemini-1.5-pro; Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
Latest update: September 2024

Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models.

Model details:
Model codes (Gemini API): imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
Supported input: text
Output: images
Token limits [*]: 480 input tokens (text)
Output images: 1 (Ultra); 1 to 4 (Standard)
Latest update: June 2025
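Imagen models are called through the images API rather than generateContent. A sketch with the google-genai Python SDK's generate_images call; the prompt, image count, and filenames are illustrative.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-4.0-generate-preview-06-06",
    prompt="A macro photo of dew on a spider web at sunrise",
    config=types.GenerateImagesConfig(number_of_images=2),  # Ultra returns exactly 1
)

# Each generated image arrives as raw bytes.
for i, generated in enumerate(response.generated_images):
    with open(f"imagen_{i}.png", "wb") as f:
        f.write(generated.image.image_bytes)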
Imagen 3

Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details:
Model code (Gemini API): imagen-3.0-generate-002
Supported input: text
Output: images
Token limits [*]: input N/A
Output images: up to 4
Latest update: February 2025

Veo 2

Veo 2 is our high quality text- and image-to-video model, capable of generating detailed videos, capturing the artistic nuance in your prompts.

Model details:
Model code (Gemini API): veo-2.0-generate-001
Supported inputs: text and images
Output: video
Limits: text input N/A; image input at any resolution and aspect ratio up to a 20 MB file size
Output videos: up to 2
Latest update: April 2025

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details:
Model code: models/gemini-live-2.5-flash-preview
Supported inputs: audio, video, and text
Output: text and audio
Token limits [*]: 1,048,576 input; 8,192 output
Capabilities: structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not.
Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview
Latest update: June 2025
Knowledge cutoff: January 2025

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details:
Model code: models/gemini-2.0-flash-live-001
Supported inputs: audio, video, and text
Output: text and audio
Token limits [*]: 1,048,576 input; 8,192 output
Capabilities: structured outputs, function calling, code execution, search, and audio generation are supported; tuning, image generation, and thinking are not.
Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001
Latest update: April 2025
Knowledge cutoff: August 2024
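The Live models are driven over a persistent bidirectional session rather than one-shot generateContent calls. Below is a minimal text-only sketch, assuming the google-genai Python SDK's Live API surface (client.aio.live.connect and the session methods from the Live API guide); these models can also stream AUDIO responses.

Python

import asyncio

from google import genai

client = genai.Client()

async def main():
    # Open a bidirectional session; TEXT keeps the demo simple.
    async with client.aio.live.connect(
        model="gemini-live-2.5-flash-preview",
        config={"response_modalities": ["TEXT"]},
    ) as session:
        await session.send_client_content(
            turns={"role": "user", "parts": [{"text": "Hello there!"}]},
            turn_complete=True,
        )
        # Stream the model's reply as it arrives.
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")

asyncio.run(main())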
Gemini Embedding Experimental

Gemini Embedding achieves state-of-the-art performance across many key dimensions, including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details:
Model code (Gemini API): gemini-embedding-exp-03-07
Supported input: text
Output: text embeddings
Token limits [*]: 8,192 input
Output dimension size: elastic; supports 3072, 1536, or 768
Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance.

Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details:
Model code (Gemini API): models/text-embedding-004
Supported input: text
Output: text embeddings
Token limits [*]: 2,048 input
Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: not supported
Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details:
Model code: models/embedding-001
Supported input: text
Output: text embeddings
Token limits [*]: 2,048 input
Output dimension size: 768
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: not supported
Latest update: December 2023
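All of the embedding variants are called through the same embedContent surface. A sketch with the google-genai Python SDK; the output_dimensionality option is shown as an assumption based on the experimental model's elastic dimension support.

Python

from google import genai
from google.genai import types

client = genai.Client()

result = client.models.embed_content(
    model="gemini-embedding-exp-03-07",
    contents="How do text embeddings measure relatedness?",
    config=types.EmbedContentConfig(output_dimensionality=768),
)

# One embedding per input; each embedding is a list of floats.
for embedding in result.embeddings:
    print(len(embedding.values))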
AQA

You can use the AQA model to perform Attributed Question-Answering (AQA) tasks over a document, corpus, or a set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question is answerable.

Model details:
Model code: models/aqa
Supported input: text
Output: text
Supported language: English
Token limits [*]: 7,168 input; 1,024 output
Rate limits [**]: 1,500 requests per minute
Adjustable safety settings: supported
Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable: points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable: points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview: points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental: points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice. We don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. You can find the previous experimental models we released below, along with the replacement version for each:

gemini-2.5-flash-preview-04-17 (base model: Gemini 2.5 Flash); replacement: gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation (base model: Gemini 2.0 Flash); replacement: gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 (base model: Gemini 2.5 Pro); replacement: gemini-2.5-pro
gemini-2.5-pro-preview-05-06 (base model: Gemini 2.5 Pro); replacement: gemini-2.5-pro
gemini-2.5-pro-preview-03-25 (base model: Gemini 2.5 Pro); replacement: gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 (base model: Gemini 2.5 Flash); replacement: gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 (base model: Gemini 2.0 Pro Experimental); replacement: gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp (base model: Gemini 2.0 Flash); replacement: gemini-2.0-flash
gemini-exp-1206 (base model: Gemini 2.0 Pro); replacement: gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 (base model: Gemini 2.0 Flash Thinking); replacement: gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 (base model: Gemini); replacement: gemini-exp-1206
gemini-exp-1114 (base model: Gemini); replacement: gemini-exp-1206
gemini-1.5-pro-exp-0827 (base model: Gemini 1.5 Pro); replacement: gemini-exp-1206
gemini-1.5-pro-exp-0801 (base model: Gemini 1.5 Pro); replacement: gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 (base model: Gemini 1.5 Flash-8B); replacement: gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 (base model: Gemini 1.5 Flash-8B); replacement: gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).

Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates. Last updated 2025-07-07 UTC.
\ No newline at end of file
diff --git a/text_content/models_gemini_dec0db6c.txt b/text_content/models_gemini_dec0db6c.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c23211d8e7bb70097f2815e4b7f0f8a26f74609a
--- /dev/null
+++ b/text_content/models_gemini_dec0db6c.txt
@@ -0,0 +1,5 @@
+URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-1.5-pro
+Title: Gemini models  |  Gemini API  |  Google AI for Developers
+==================================================
\ No newline at end of file diff --git a/text_content/models_gemini_fc9defd2.txt b/text_content/models_gemini_fc9defd2.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c252a98a9188f094866efbf5375a3b18dc81aa6 --- /dev/null +++ b/text_content/models_gemini_fc9defd2.txt @@ -0,0 +1,5 @@ +URL: https://ai.google.dev/gemini-api/docs/models/gemini#gemini-2.5-flash-native-audio +Title: Gemini models  |  Gemini API  |  Google AI for Developers +================================================== + +Gemini models | Gemini API | Google AI for Developers Skip to main content / English Deutsch Español – América Latina Français Indonesia Italiano Polski Português – Brasil Shqip Tiếng Việt Türkçe Русский עברית العربيّة فارسی हिंदी বাংলা ภาษาไทย 中文 – 简体 中文 – 繁體 日本語 한국어 Sign in Introducing Batch Mode, with higher rate limits and a 50% token discount. Learn more Home Gemini API Models Send feedback Gemini models 2.5 Pro spark Our most powerful thinking model with maximum response accuracy and state-of-the-art performance Input audio, images, video, and text, get text responses Tackle difficult problems, analyze large databases, and more Best for complex coding, reasoning, and multimodal understanding 2.5 Flash spark Our best model in terms of price-performance, offering well-rounded capabilities. Input audio, images, video, and text, and get text responses Model thinks as needed; or, you can configure a thinking budget Best for low latency, high volume tasks that require thinking 2.5 Flash-Lite experiment A Gemini 2.5 Flash model optimized for cost efficiency and low latency. Input audio, images, video, and text, and get text responses Most cost-efficient model supporting high throughput Best for real time, low latency use cases Note: Gemini 2.5 Pro and 2.5 Flash come with thinking on by default . If you're migrating from a non-thinking model such as 2.0 Pro or Flash, we recommend you to review the Thinking guide first. Model variants The Gemini API offers different models that are optimized for specific use cases. Here's a brief overview of Gemini variants that are available: Model variant Input(s) Output Optimized for Gemini 2.5 Pro gemini-2.5-pro Audio, images, videos, text, and PDF Text Enhanced thinking and reasoning, multimodal understanding, advanced coding, and more Gemini 2.5 Flash gemini-2.5-flash Audio, images, videos, and text Text Adaptive thinking, cost efficiency Gemini 2.5 Flash-Lite Preview gemini-2.5-flash-lite-preview-06-17 Text, image, video, audio Text Most cost-efficient model supporting high throughput Gemini 2.5 Flash Native Audio gemini-2.5-flash-preview-native-audio-dialog & gemini-2.5-flash-exp-native-audio-thinking-dialog Audio, videos, and text Text and audio, interleaved High quality, natural conversational audio outputs, with or without thinking Gemini 2.5 Flash Preview TTS gemini-2.5-flash-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.5 Pro Preview TTS gemini-2.5-pro-preview-tts Text Audio Low latency, controllable, single- and multi-speaker text-to-speech audio generation Gemini 2.0 Flash gemini-2.0-flash Audio, images, videos, and text Text Next generation features, speed, and realtime streaming. 
Gemini 2.0 Flash Preview Image Generation gemini-2.0-flash-preview-image-generation Audio, images, videos, and text Text, images Conversational image generation and editing Gemini 2.0 Flash-Lite gemini-2.0-flash-lite Audio, images, videos, and text Text Cost efficiency and low latency Gemini 1.5 Flash gemini-1.5-flash Audio, images, videos, and text Text Fast and versatile performance across a diverse variety of tasks Gemini 1.5 Flash-8B gemini-1.5-flash-8b Audio, images, videos, and text Text High volume and lower intelligence tasks Gemini 1.5 Pro gemini-1.5-pro Audio, images, videos, and text Text Complex reasoning tasks requiring more intelligence Gemini Embedding gemini-embedding-exp Text Text embeddings Measuring the relatedness of text strings Imagen 4 imagen-4.0-generate-preview-06-06 imagen-4.0-ultra-generate-preview-06-06 Text Images Our most up-to-date image generation model Imagen 3 imagen-3.0-generate-002 Text Images High quality image generation model Veo 2 veo-2.0-generate-001 Text, images Video High quality video generation Gemini 2.5 Flash Live gemini-live-2.5-flash-preview Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions Gemini 2.0 Flash Live gemini-2.0-flash-live-001 Audio, video, and text Text, audio Low-latency bidirectional voice and video interactions You can view the rate limits for each model on the rate limits page . Gemini 2.5 Pro Gemini 2.5 Pro is our state-of-the-art thinking model, capable of reasoning over complex problems in code, math, and STEM, as well as analyzing large datasets, codebases, and documents using long context. Try in Google AI Studio Model details Property Description id_card Model code gemini-2.5-pro save Supported data types Inputs Audio, images, video, text, and PDF Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-pro Preview: gemini-2.5-pro-preview-06-05 Preview: gemini-2.5-pro-preview-05-06 Preview: gemini-2.5-pro-preview-03-25 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Our best model in terms of price-performance, offering well-rounded capabilities. 2.5 Flash is best for large scale processing, low-latency, high volume tasks that require thinking, and agentic use cases. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash save Supported data types Inputs Text, images, video, audio Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 65,536 handyman Capabilities Audio generation Not supported Caching Supported Code execution Supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Supported Thinking Supported Tuning Not supported Batch API Supported 123 Versions Read the model version patterns for more details. Stable: gemini-2.5-flash Preview: gemini-2.5-flash-preview-05-20 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash-Lite Preview A Gemini 2.5 Flash model optimized for cost efficiency and low latency. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-lite-preview-06-17 save Supported data types Inputs Text, images, video, and audio Output Text token_auto Token limits [*] Input token limit 1,000,000 Output token limit 64,000 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported URL Context Supported Search grounding Supported Image generation Not supported Audio generation Not supported Live API Not supported Thinking Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-lite-preview-06-17 calendar_month Latest update June 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Native Audio Our native audio dialog models, with and without thinking, available through the Live API . These models provide interactive and unstructured conversational experiences, with style and control prompting. Try native audio in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-native-audio-dialog & models/gemini-2.5-flash-exp-native-audio-thinking-dialog save Supported data types Inputs Audio, video, text Output Audio and text token_auto Token limits [*] Input token limit 128,000 Output token limit 8,000 handyman Capabilities Audio generation Supported Caching Not supported Code execution Not supported Function calling Supported Image generation Not supported Search grounding Supported Structured outputs Not supported Thinking Supported Tuning Not supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.5-flash-preview-05-20 Experimental: gemini-2.5-flash-exp-native-audio-thinking-dialog calendar_month Latest update May 2025 cognition_2 Knowledge cutoff January 2025 Gemini 2.5 Flash Preview Text-to-Speech Gemini 2.5 Flash Preview TTS is our price-performant text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Flash rate limits are more restricted since it is an experimental / preview model. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-flash-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-flash-preview-tts calendar_month Latest update May 2025 Gemini 2.5 Pro Preview Text-to-Speech Gemini 2.5 Pro Preview TTS is our most powerful text-to-speech model, delivering high control and transparency for structured workflows like podcast generation, audiobooks, customer support, and more. Gemini 2.5 Pro rate limits are more restricted since it is an experimental / preview model. 
Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.5-pro-preview-tts save Supported data types Inputs Text Output Audio token_auto Token limits [*] Input token limit 8,000 Output token limit 16,000 handyman Capabilities Structured outputs Not supported Caching Not supported Tuning Not supported Function calling Not supported Code execution Not supported Search Not supported Audio generation Supported Live API Not supported Thinking Not supported 123 Versions Read the model version patterns for more details. gemini-2.5-pro-preview-tts calendar_month Latest update May 2025 Gemini 2.0 Flash Gemini 2.0 Flash delivers next-gen features and improved capabilities, including superior speed, native tool use, and a 1M token context window. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Supported Search Supported Image generation Not supported Audio generation Not supported Live API Supported Thinking Experimental Batch API Supported 123 Versions Read the model version patterns for more details. Latest: gemini-2.0-flash Stable: gemini-2.0-flash-001 Experimental: gemini-2.0-flash-exp calendar_month Latest update February 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash Preview Image Generation Gemini 2.0 Flash Preview Image Generation delivers improved image generation features, including generating and editing images conversationally. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-preview-image-generation save Supported data types Inputs Audio, images, video, and text Output Text and images token_auto Token limits [*] Input token limit 32,000 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Not supported Code execution Not Supported Search Not Supported Image generation Supported Audio generation Not supported Live API Not Supported Thinking Not Supported 123 Versions Read the model version patterns for more details. Preview: gemini-2.0-flash-preview-image-generation gemini-2.0-flash-preview-image-generation is not currently supported in a number of countries in Europe, Middle East & Africa calendar_month Latest update May 2025 cognition_2 Knowledge cutoff August 2024 Gemini 2.0 Flash-Lite A Gemini 2.0 Flash model optimized for cost efficiency and low latency. Try in Google AI Studio Model details Property Description id_card Model code models/gemini-2.0-flash-lite save Supported data types Inputs Audio, images, video, and text Output Text token_auto Token limits [*] Input token limit 1,048,576 Output token limit 8,192 handyman Capabilities Structured outputs Supported Caching Supported Tuning Not supported Function calling Supported Code execution Not supported Search Not supported Image generation Not supported Audio generation Not supported Live API Not supported Batch API Supported 123 Versions Read the model version patterns for more details. 
Gemini 2.0 Flash-Lite

A Gemini 2.0 Flash model optimized for cost efficiency and low latency.

Model details:
- Model code: models/gemini-2.0-flash-lite
- Supported data types: Inputs: Audio, images, video, and text; Output: Text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Not supported; Search: Not supported; Image generation: Not supported; Audio generation: Not supported; Live API: Not supported; Batch API: Supported
- Versions (read the model version patterns for more details): Latest: gemini-2.0-flash-lite; Stable: gemini-2.0-flash-lite-001
- Latest update: February 2025
- Knowledge cutoff: August 2024

Gemini 1.5 Flash

Gemini 1.5 Flash is a fast and versatile multimodal model for scaling across diverse tasks.

Model details:
- Model code: models/gemini-1.5-flash
- Supported data types: Inputs: Audio, images, video, and text; Output: Text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: Approximately 9.5 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-latest; Latest stable: gemini-1.5-flash; Stable: gemini-1.5-flash-001, gemini-1.5-flash-002
- Latest update: September 2024

Gemini 1.5 Flash-8B

Gemini 1.5 Flash-8B is a small model designed for lower-intelligence tasks.

Model details:
- Model code: models/gemini-1.5-flash-8b
- Supported data types: Inputs: Audio, images, video, and text; Output: Text
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 3,600; Maximum video length: 1 hour; Maximum audio length: Approximately 9.5 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-flash-8b-latest; Latest stable: gemini-1.5-flash-8b; Stable: gemini-1.5-flash-8b-001
- Latest update: October 2024

Gemini 1.5 Pro

Try Gemini 2.5 Pro Preview, our most advanced Gemini model to date.

Gemini 1.5 Pro is a mid-size multimodal model that is optimized for a wide range of reasoning tasks. 1.5 Pro can process large amounts of data at once, including 2 hours of video, 19 hours of audio, codebases with 60,000 lines of code, or 2,000 pages of text.

Model details:
- Model code: models/gemini-1.5-pro
- Supported data types: Inputs: Audio, images, video, and text; Output: Text
- Token limits [*]: Input: 2,097,152; Output: 8,192
- Audio/visual specs: Maximum number of images per prompt: 7,200; Maximum video length: 2 hours; Maximum audio length: Approximately 19 hours
- Capabilities: System instructions: Supported; JSON mode: Supported; JSON schema: Supported; Adjustable safety settings: Supported; Caching: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Live API: Not supported
- Versions (read the model version patterns for more details): Latest: gemini-1.5-pro-latest; Latest stable: gemini-1.5-pro; Stable: gemini-1.5-pro-001, gemini-1.5-pro-002
- Latest update: September 2024

Imagen 4

Imagen 4 is our latest image model, capable of generating highly detailed images with rich lighting, significantly better text rendering, and higher resolution output than previous models.

Model details:
- Model code (Gemini API): imagen-4.0-generate-preview-06-06, imagen-4.0-ultra-generate-preview-06-06
- Supported data types: Input: Text; Output: Images
- Token limits [*]: Input: 480 tokens (text)
- Output images: 1 (Ultra); 1 to 4 (Standard)
- Latest update: June 2025

Imagen 3

Imagen 3 is our highest quality text-to-image model, capable of generating images with even better detail, richer lighting, and fewer distracting artifacts than our previous models.

Model details:
- Model code (Gemini API): imagen-3.0-generate-002
- Supported data types: Input: Text; Output: Images
- Token limits [*]: Input: N/A
- Output images: Up to 4
- Latest update: February 2025
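The Imagen models above use a dedicated image generation method rather than generateContent. The following is a minimal sketch with the Python SDK; the prompt and the four-image request are illustrative.

Python

from google import genai
from google.genai import types

client = genai.Client()

response = client.models.generate_images(
    model="imagen-3.0-generate-002",
    prompt="A studio photo of a ceramic teapot in soft window light",
    # Standard Imagen models return 1 to 4 images per request.
    config=types.GenerateImagesConfig(number_of_images=4),
)

for i, generated in enumerate(response.generated_images):
    # Each result carries the encoded image bytes.
    with open(f"teapot-{i}.png", "wb") as f:
        f.write(generated.image.image_bytes)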
Veo 2

Veo 2 is our high-quality text- and image-to-video model, capable of generating detailed videos that capture the artistic nuance in your prompts.

Model details:
- Model code (Gemini API): veo-2.0-generate-001
- Supported data types: Input: Text, image; Output: Video
- Limits: Text input: N/A; Image input: Any image resolution and aspect ratio up to 20 MB file size; Output videos: Up to 2
- Latest update: April 2025

Gemini 2.5 Flash Live

The Gemini 2.5 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details:
- Model code: models/gemini-live-2.5-flash-preview
- Supported data types: Inputs: Audio, video, and text; Output: Text and audio
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-live-2.5-flash-preview
- Latest update: June 2025
- Knowledge cutoff: January 2025

Gemini 2.0 Flash Live

The Gemini 2.0 Flash Live model works with the Live API to enable low-latency bidirectional voice and video interactions with Gemini. The model can process text, audio, and video input, and it can provide text and audio output.

Model details:
- Model code: models/gemini-2.0-flash-live-001
- Supported data types: Inputs: Audio, video, and text; Output: Text and audio
- Token limits [*]: Input: 1,048,576; Output: 8,192
- Capabilities: Structured outputs: Supported; Tuning: Not supported; Function calling: Supported; Code execution: Supported; Search: Supported; Image generation: Not supported; Audio generation: Supported; Thinking: Not supported
- Versions (read the model version patterns for more details): Preview: gemini-2.0-flash-live-001
- Latest update: April 2025
- Knowledge cutoff: August 2024
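The Live models above are used through a stateful session rather than a one-shot request. The following is a minimal text-in, text-out sketch; the connect, send_client_content, and receive calls follow the Live API guide, and the prompt is illustrative.

Python

import asyncio

from google import genai
from google.genai import types

client = genai.Client()


async def main():
    # Ask the session for text responses; audio output is also supported.
    config = {"response_modalities": ["TEXT"]}
    async with client.aio.live.connect(
        model="gemini-2.0-flash-live-001", config=config
    ) as session:
        await session.send_client_content(
            turns=types.Content(role="user", parts=[types.Part(text="Hello there")]),
            turn_complete=True,
        )
        # Stream the model's reply as it arrives.
        async for message in session.receive():
            if message.text is not None:
                print(message.text, end="")


asyncio.run(main())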
Gemini Embedding Experimental

Gemini embedding achieves state-of-the-art (SOTA) performance across many key dimensions, including code, multilingual, and retrieval. Gemini Embedding rate limits are more restricted since it is an experimental model.

Model details:
- Model code (Gemini API): gemini-embedding-exp-03-07
- Supported data types: Input: Text; Output: Text embeddings
- Token limits [*]: Input: 8,192
- Output dimension size: Elastic; supports 3072, 1536, or 768
- Latest update: March 2025

Text Embedding and Embedding

Text Embedding

Try our new experimental Gemini embedding model, which achieves state-of-the-art performance.

Text embeddings are used to measure the relatedness of strings and are widely used in many AI applications. text-embedding-004 achieves stronger retrieval performance and outperforms existing models with comparable dimensions on the standard MTEB embedding benchmarks.

Model details:
- Model code (Gemini API): models/text-embedding-004
- Supported data types: Input: Text; Output: Text embeddings
- Token limits [*]: Input: 2,048
- Output dimension size: 768
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Not supported
- Latest update: April 2024

Embedding

Note: Text Embedding is the newer version of the Embedding model. If you're creating a new project, use Text Embedding.

You can use the Embedding model to generate text embeddings for input text. The Embedding model is optimized for creating embeddings with 768 dimensions for text of up to 2,048 tokens.

Embedding model details:
- Model code: models/embedding-001
- Supported data types: Input: Text; Output: Text embeddings
- Token limits [*]: Input: 2,048
- Output dimension size: 768
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Not supported
- Latest update: December 2023
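A minimal sketch of requesting embeddings for the models above and comparing two strings; the cosine-similarity arithmetic is an illustrative addition, not part of the model reference.

Python

from google import genai

client = genai.Client()

result = client.models.embed_content(
    model="text-embedding-004",  # or models/embedding-001, gemini-embedding-exp-03-07
    contents=["How do I get a Gemini API key?", "Where can I find my API key?"],
)

# One 768-dimensional vector per input string (for text-embedding-004).
a, b = (e.values for e in result.embeddings)

# Cosine similarity as a simple relatedness score.
dot = sum(x * y for x, y in zip(a, b))
norm = (sum(x * x for x in a) ** 0.5) * (sum(y * y for y in b) ** 0.5)
print(dot / norm)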
AQA

You can use the AQA model to perform Attributed Question-Answering (AQA)-related tasks over a document, corpus, or set of passages. The AQA model returns answers to questions that are grounded in the provided sources, along with an estimate of the probability that the question is answerable.

Model details:
- Model code: models/aqa
- Supported data types: Input: Text; Output: Text
- Supported language: English
- Token limits [*]: Input: 7,168; Output: 1,024
- Rate limits [**]: 1,500 requests per minute
- Adjustable safety settings: Supported
- Latest update: December 2023

See the examples to explore the capabilities of these model variations.

[*] A token is equivalent to about 4 characters for Gemini models. 100 tokens are about 60-80 English words.

Model version name patterns

Gemini models are available in either stable, preview, or experimental versions. In your code, you can use one of the following model name formats to specify which model and version you want to use.

Latest stable

Points to the most recent stable version released for the specified model generation and variation. To specify the latest stable version, use the following pattern: <model>-<generation>-<variation>. For example, gemini-2.0-flash.

Stable

Points to a specific stable model. Stable models usually don't change. Most production apps should use a specific stable model. To specify a stable version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-flash-001.

Preview

Points to a preview model, which may not be suitable for production use and comes with more restrictive rate limits, but may have billing enabled. To specify a preview version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.5-pro-preview-06-05.

Experimental

Points to an experimental model, which may not be suitable for production use and comes with more restrictive rate limits. We release experimental models to gather feedback and get our latest updates into the hands of developers quickly. To specify an experimental version, use the following pattern: <model>-<generation>-<variation>-<version>. For example, gemini-2.0-pro-exp-02-05.

Experimental models

In addition to stable models, the Gemini API offers experimental models, which may not be suitable for production use and come with more restrictive rate limits. We release experimental models to gather feedback, get our latest updates into the hands of developers quickly, and highlight the pace of innovation happening at Google. What we learn from experimental launches informs how we release models more widely. An experimental model can be swapped for another without prior notice, and we don't guarantee that an experimental model will become a stable model in the future.

Previous experimental models

As new versions or stable releases become available, we remove and replace experimental models. The previous experimental models we released are listed below along with their replacement versions:

Model code | Base model | Replacement version
gemini-2.5-flash-preview-04-17 | Gemini 2.5 Flash | gemini-2.5-flash-preview-05-20
gemini-2.0-flash-exp-image-generation | Gemini 2.0 Flash | gemini-2.0-flash-preview-image-generation
gemini-2.5-pro-preview-06-05 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-05-06 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.5-pro-preview-03-25 | Gemini 2.5 Pro | gemini-2.5-pro
gemini-2.0-flash-thinking-exp-01-21 | Gemini 2.5 Flash | gemini-2.5-flash-preview-04-17
gemini-2.0-pro-exp-02-05 | Gemini 2.0 Pro Experimental | gemini-2.5-pro-preview-03-25
gemini-2.0-flash-exp | Gemini 2.0 Flash | gemini-2.0-flash
gemini-exp-1206 | Gemini 2.0 Pro | gemini-2.0-pro-exp-02-05
gemini-2.0-flash-thinking-exp-1219 | Gemini 2.0 Flash Thinking | gemini-2.0-flash-thinking-exp-01-21
gemini-exp-1121 | Gemini | gemini-exp-1206
gemini-exp-1114 | Gemini | gemini-exp-1206
gemini-1.5-pro-exp-0827 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-pro-exp-0801 | Gemini 1.5 Pro | gemini-exp-1206
gemini-1.5-flash-8b-exp-0924 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b
gemini-1.5-flash-8b-exp-0827 | Gemini 1.5 Flash-8B | gemini-1.5-flash-8b

Supported languages

Gemini models are trained to work with the following languages: Arabic (ar), Bengali (bn), Bulgarian (bg), Chinese simplified and traditional (zh), Croatian (hr), Czech (cs), Danish (da), Dutch (nl), English (en), Estonian (et), Finnish (fi), French (fr), German (de), Greek (el), Hebrew (iw), Hindi (hi), Hungarian (hu), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Latvian (lv), Lithuanian (lt), Norwegian (no), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru), Serbian (sr), Slovak (sk), Slovenian (sl), Spanish (es), Swahili (sw), Swedish (sv), Thai (th), Turkish (tr), Ukrainian (uk), Vietnamese (vi).