outsu commited on
Commit
2ddd289
·
verified ·
1 Parent(s): 5f1d35d

First TeLVE space is deployed with v1.0 model.

Browse files
.gitattributes CHANGED
@@ -1,35 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ TeLVE/images/mugla.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,34 @@
1
- ---
2
- title: TeLVE
3
- emoji: 🏢
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.9.0
8
- app_file: app.py
9
- pinned: false
10
- license: cc-by-4.0
11
- short_description: A space interface for run TeLVE, first Turkish VLM ever.
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TeLVE
3
+ emoji: 🧿
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.9.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: cc-by-4.0
11
+ short_description: A space interface for running TeLVE, the first Turkish VLM ever.
12
+ ---
13
+
14
+ # TeLVE - Turkish efficient Language Vision Engine 🧿
15
+
16
+
17
+ ![TeLVE logo](<https://huggingface.co/outsu/TeLVE/resolve/main/teLVE_logo.png>)
18
+
19
+ **TeLVE** is the first Turkish Visual Language Model designed to generate descriptive captions in Turkish for images. You can use TeLVE models online with this space.
20
+
21
+ For more details, you can browse the [TeLVE model page on Hugging Face](https://huggingface.co/outsu/TeLVE).
22
+
23
+ ## How to Use
24
+
25
+ 1. **Upload an Image**: Click on "Upload Image" to select a file.
26
+ 2. **Select a Model**: Choose a model from the dropdown menu.
27
+ 3. **Generate Caption**: Press "Submit" to see the generated caption.
28
+
29
+ ## License
30
+
31
+ This project is licensed under the [CC BY 4.0 License](https://creativecommons.org/licenses/by/4.0/).
32
+
33
+
34
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TeLVE/README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ language:
4
+ - en
5
+ - tr
6
+ tags:
7
+ - VLM
8
+ - image2text
9
+ - lm
10
+ ---
11
+ # TeLVE: Turkish efficient Language Vision Engine 🧿
12
+ [![License: CC BY 4.0](https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/)
13
+ [![Models: v1.0](https://img.shields.io/badge/Models-v1.0%2c%20v1.0dep-blue)](https://huggingface.co/outsu/TeLVE)
14
+ ## First Turkish VLM ever!
15
+
16
+ TeLVE is the first Visual Language Model specifically designed for Turkish language understanding and image description generation. Built on Vision Transformer (ViT) and BERT pre-trained encoder architectures, it bridges the gap in Turkish visual-linguistic processing.
17
+
18
+ ![TeLVE logo](<teLVE_logo.png>)
19
+
20
+ ## Model Description
21
+
22
+ TeLVE combines:
23
+ - 🖼️ Vision Transformer (ViT-base-patch16-224)
24
+ - 📝 Turkish BERT (dbmdz/bert-base-turkish-cased)
25
+ - 🔄 Cross-attention mechanism for vision-language fusion
26
+
27
+ ### Version Logs
28
+ - **TeLVE v1.0**: Trained on Unsplash Lite dataset
29
+ - **TeLVE v1.0dep**: Dataset enhanced with selective images from Pexels images, the encoder problem with letter "ü" was fixed. *(Deprecated, performance was decreased because of dataset addressing problem. Not recommended to use.)*
30
+
31
+ ## Usage
32
+
33
+ The model can be used in two ways:
34
+
35
+ ### Inference (imagine.py)
36
+ ```python
37
+ # Generate captions for images
38
+ python imagine.py
39
+ ```
40
+ This script:
41
+ - Loads a trained TeLVE model
42
+ - Takes images from `images` directory
43
+ - Generates Turkish captions for each image
44
+ - Outputs the results to console
45
+
46
+ ### Training (main.py)
47
+ Users can train their own models with ViT and BERT encoders.
48
+ ```python
49
+ # Train a new model
50
+ python main.py
51
+ ```
52
+
53
+ This script:
54
+ - Loads and preprocesses image-caption pairs
55
+ - Initializes ViT and BERT encoders
56
+ - Trains the combined model
57
+ - Saves the model and tokenizer
58
+
59
+
60
+ ## Performance
61
+ Performance scores will be evaluated.
62
+ <!--
63
+ | Model Version | Dataset | BLEU-4 | METEOR | CIDEr |
64
+ |--------------|---------|---------|---------|--------|
65
+ | TeLVE v1.0 | Unsplash | *TBD* | *TBD* | *TBD* |
66
+ | TeLVE v1.1 | Unsplash+Pexels | *TBD* | *TBD* | *TBD* |-->
67
+
68
+ ## Citation
69
+
70
+ ```bibtex
71
+ @software{telve2024,
72
+ author = {Öğüt Su Karagün},
73
+ title = {TeLVE: Turkish efficient Language Vision Engine},
74
+ year = {2024},
75
+ url = {https://huggingface.co/outsu/TeLVE}
76
+ }
77
+ ```
78
+
79
+ ## License
80
+ This work is licensed under a [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/).
TeLVE/datasets/TeLVE_v1.0.tsv000 ADDED
The diff for this file is too large to render. See raw diff
 
TeLVE/images/mugla.jpg ADDED

Git LFS Details

  • SHA256: 65b8124c02dc5afefbfd1ac848b633c67301215a3dc4f9c0b8c84790572bf7ea
  • Pointer size: 132 Bytes
  • Size of remote file: 4.32 MB
TeLVE/imagine.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torchvision import transforms
4
+ from transformers import ViTModel, BertTokenizerFast, BertConfig, BertLMHeadModel
5
+ from PIL import Image
6
+ import os
7
+
8
+ # Check if CUDA is available
9
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
+ print(f"Using device: {device}")
11
+
12
+ # Define constants
13
+ VIT_MODEL_NAME = "google/vit-base-patch16-224"
14
+ BERT_MODEL_NAME = "dbmdz/bert-base-turkish-cased"
15
+ MAX_LENGTH = 128
16
+
17
class ImageCaptioningModel(nn.Module):
    """ViT image encoder + Turkish BERT decoder joined by a linear projection.

    NOTE: the attribute names ``vit``, ``bert`` and ``linear`` appear in the
    saved ``state_dict`` keys, so they must stay as-is or existing
    checkpoints will no longer load.
    """

    def __init__(self, vit_model, bert_model):
        super().__init__()
        self.vit = vit_model    # vision encoder backbone
        self.bert = bert_model  # text decoder with cross-attention
        # Project ViT hidden states into the decoder's hidden size.
        self.linear = nn.Linear(self.vit.config.hidden_size, self.bert.config.hidden_size)

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        """Run one decoding step over the batch; returns ``(loss, logits)``."""
        visual_tokens = self.linear(self.vit(pixel_values).last_hidden_state)
        decoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=visual_tokens,
            labels=labels,
            return_dict=True,
        )
        return decoder_out.loss, decoder_out.logits
35
+
36
def load_model(model_path):
    """Rebuild the TeLVE architecture and load trained weights from *model_path*.

    Returns the combined model on the module-level ``device``, in eval mode.
    """
    # Vision encoder backbone.
    vision_encoder = ViTModel.from_pretrained(VIT_MODEL_NAME)

    # Turkish BERT configured as a decoder with cross-attention so it can
    # attend to the projected image features.
    decoder_config = BertConfig.from_pretrained(BERT_MODEL_NAME)
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True
    text_decoder = BertLMHeadModel.from_pretrained(BERT_MODEL_NAME, config=decoder_config)

    # Assemble the captioner and restore the trained parameters.
    captioner = ImageCaptioningModel(vision_encoder, text_decoder)
    captioner.load_state_dict(torch.load(model_path, map_location=device))
    captioner.to(device)
    captioner.eval()
    return captioner
50
+
51
def generate_caption(model, image_path, tokenizer):
    """Greedily decode a Turkish caption for the image at *image_path*.

    Decoding starts from [CLS], picks the argmax token each step, and stops
    at [SEP] or after MAX_LENGTH steps.
    """
    # Same preprocessing the model saw during training (224x224, ImageNet stats).
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    pixel_values = preprocess(Image.open(image_path).convert('RGB')).unsqueeze(0).to(device)

    with torch.no_grad():
        # Seed the decoder with the [CLS] token.
        token_ids = torch.tensor([[tokenizer.cls_token_id]]).to(device)
        mask = torch.tensor([[1]]).to(device)

        for _ in range(MAX_LENGTH):
            # Re-run the full sequence each step and take the last position.
            _, logits = model(pixel_values, token_ids, mask)
            next_id = logits[:, -1, :].argmax(dim=-1)

            if next_id.item() == tokenizer.sep_token_id:
                break

            token_ids = torch.cat([token_ids, next_id.unsqueeze(0)], dim=1)
            mask = torch.cat([mask, torch.tensor([[1]]).to(device)], dim=1)

    return tokenizer.decode(token_ids[0], skip_special_tokens=True)
78
+
79
def main():
    """Caption every image under ./images with the bundled TeLVE checkpoint."""
    # BUG FIX: this commit ships ./models/TeLVE_v1.0.pth, but the script
    # pointed at a non-existent TeLVE_v1.1.pth, so the existence check below
    # always failed and nothing was ever captioned.
    model_path = "./models/TeLVE_v1.0.pth"
    tokenizer_path = "./tokenizer"

    # Bail out early with a clear message if the artifacts are missing.
    if not os.path.exists(model_path) or not os.path.exists(tokenizer_path):
        print("Model or tokenizer not found. Please make sure you have trained the model and saved it correctly.")
        return

    # Load the model and tokenizer
    model = load_model(model_path)
    tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

    # Generate captions for images in a specified directory
    image_dir = "./images"  # Change this to the directory containing your test images
    for image_file in os.listdir(image_dir):
        if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(image_dir, image_file)
            caption = generate_caption(model, image_path, tokenizer)
            print(f"Image: {image_file}")
            print(f"Generated Caption: {caption}")
            print("---")

if __name__ == "__main__":
    main()
TeLVE/main.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.utils.data import Dataset, DataLoader
5
+ from torchvision import transforms
6
+ from transformers import ViTModel, BertTokenizerFast, BertConfig, BertLMHeadModel, AdamW
7
+ from PIL import Image, ImageFile
8
+ import pandas as pd
9
+ from tqdm import tqdm
10
+
11
+ # Increase the maximum image size limit to avoid DecompressionBombWarning
12
+ Image.MAX_IMAGE_PIXELS = None
13
+ # Allow loading truncated images
14
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
15
+
16
+ # Check if CUDA is available
17
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+ print(f"Using device: {device}")
19
+
20
+ # Define constants
21
+ VIT_MODEL_NAME = "google/vit-base-patch16-224"
22
+ BERT_MODEL_NAME = "dbmdz/bert-base-turkish-cased" # Using a Turkish BERT model
23
+ model = "TeLVE_v1.0.pth"
24
+ MAX_LENGTH = 128
25
+ BATCH_SIZE = 8
26
+ EPOCHS = 5
27
+ LEARNING_RATE = 2e-5
28
+
29
class ImageCaptioningDataset(Dataset):
    """Image/caption pairs backed by a dataframe, for captioning training.

    Each item is a dict of tensors ready for the model; samples whose image
    cannot be read or whose caption is not a string yield ``None`` so that
    ``collate_fn`` can drop them.
    """

    def __init__(self, dataframe, img_dir, tokenizer):
        self.dataframe = dataframe
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        # 224x224 + ImageNet mean/std, matching the ViT encoder's pretraining.
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sample = self.dataframe.iloc[idx]
        image_path = os.path.join(self.img_dir, sample['photo_id'] + ".jpg")

        try:
            pixels = self.transform(Image.open(image_path).convert('RGB'))
        except (FileNotFoundError, IOError):
            # Missing or unreadable image: skip this sample.
            return None

        caption = sample['ai_description']
        if not isinstance(caption, str):
            # Some rows carry NaN/absent descriptions; skip them too.
            return None

        encoded = self.tokenizer(
            caption,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'pixel_values': pixels,
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            # The caption tokens double as the language-model targets.
            'labels': encoded['input_ids'].squeeze(),
        }
76
+
77
+
78
class ImageCaptioningModel(nn.Module):
    """Combined captioner: ViT features projected into a BERT decoder.

    Keep the attribute names ``vit``/``bert``/``linear`` unchanged — they are
    baked into the checkpoint's ``state_dict`` keys.
    """

    def __init__(self, vit_model, bert_model):
        super().__init__()
        self.vit = vit_model
        self.bert = bert_model
        # Bridge between encoder and decoder hidden sizes.
        self.linear = nn.Linear(self.vit.config.hidden_size, self.bert.config.hidden_size)

    def forward(self, pixel_values, input_ids, attention_mask, labels=None):
        """Encode the image, then decode text conditioned on it.

        Returns the ``(loss, logits)`` pair produced by the BERT head.
        """
        projected = self.linear(self.vit(pixel_values).last_hidden_state)
        result = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=projected,
            labels=labels,
            return_dict=True,
        )
        return result.loss, result.logits
96
+
97
def collate_fn(batch):
    """Stack per-sample dicts into a batch, dropping skipped (None) samples.

    Returns None when every sample in *batch* was skipped, so the training
    loop can skip the whole batch.
    """
    valid = [sample for sample in batch if sample is not None]
    if not valid:
        return None
    # Stack each field across the surviving samples (keys come from the first).
    return {key: torch.stack([sample[key] for sample in valid]) for key in valid[0]}
103
+
104
def train_vlm_model():
    """Train the TeLVE captioner end to end and save weights + tokenizer.

    Reads image/caption pairs from ``./datasets/``, fine-tunes the ViT+BERT
    model, then writes the ``state_dict`` to ``./models/`` and the tokenizer
    to ``./tokenizer``.
    """
    # BUG FIX: the original assigned the network to a local also named
    # ``model``, which shadowed the module-level checkpoint name everywhere in
    # this function — the pd.read_csv line below raised UnboundLocalError and
    # the final torch.save would have concatenated str + nn.Module.  The
    # network now lives in ``captioner``; the module constant keeps naming the
    # files.  The dataset file has no ``.pth`` in its name (TeLVE_v1.0.tsv000),
    # so the extension is stripped for the dataset path.
    dataset_path = './datasets/' + os.path.splitext(model)[0] + '.tsv000'

    # Turkish text appears in several legacy encodings; try each in turn.
    encodings = ['utf-8', 'iso-8859-9', 'windows-1254']
    for encoding in encodings:
        try:
            df = pd.read_csv(dataset_path, sep='\t', encoding=encoding)
            print(f"Successfully read the file with {encoding} encoding.")
            break
        except UnicodeDecodeError:
            print(f"Failed to read with {encoding} encoding. Trying next...")
    else:
        raise ValueError("Could not read the file with any of the specified encodings.")

    # Initialize the tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL_NAME)

    # Create the dataset and dataloader
    dataset = ImageCaptioningDataset(df, '../download/images', tokenizer)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    # Initialize the encoder/decoder pair.
    vit_model = ViTModel.from_pretrained(VIT_MODEL_NAME)
    bert_config = BertConfig.from_pretrained(BERT_MODEL_NAME)
    bert_config.is_decoder = True           # decoder mode for generation
    bert_config.add_cross_attention = True  # lets BERT attend to ViT features
    bert_model = BertLMHeadModel.from_pretrained(BERT_MODEL_NAME, config=bert_config)

    # Create the combined model.
    captioner = ImageCaptioningModel(vit_model, bert_model)
    captioner.to(device)

    # Define optimizer
    optimizer = AdamW(captioner.parameters(), lr=LEARNING_RATE)

    # Training loop
    captioner.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for batch in progress_bar:
            if batch is None:
                # Every sample in this batch was missing/corrupt; skip it.
                continue

            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            loss, _ = captioner(pixel_values, input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        print(f"Epoch {epoch+1}/{EPOCHS}, Average Loss: {total_loss/len(dataloader)}")

    # Save the model weights and tokenizer for inference (imagine.py).
    torch.save(captioner.state_dict(), "./models/" + model)
    tokenizer.save_pretrained("./tokenizer")

if __name__ == "__main__":
    train_vlm_model()
TeLVE/models/TeLVE_v1.0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c79764aa75a603efead82246db2078c4d2c07edbdf218ec8719f7817f5728c68
3
+ size 904212666
TeLVE/teLVE_logo.png ADDED
TeLVE/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
TeLVE/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
TeLVE/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "max_len": 512,
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
TeLVE/tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import torch
4
+ from TeLVE.imagine import ImageCaptioningModel, load_model, generate_caption
5
+ from transformers import BertTokenizerFast
6
+
7
+ # Constants
8
+ MODELS_DIR = "./TeLVE/models"
9
+ TOKENIZER_PATH = "./TeLVE/tokenizer"
10
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+
12
def list_available_models():
    """Collect the filenames of all .pth checkpoints under MODELS_DIR.

    Returns an empty list when the directory does not exist so the UI can
    show a friendly message instead of crashing.
    """
    checkpoints = []
    if os.path.exists(MODELS_DIR):
        for name in os.listdir(MODELS_DIR):
            if name.endswith('.pth'):
                checkpoints.append(name)
    return checkpoints
17
+
18
def generate_description(image, model_name):
    """Caption *image* with the checkpoint *model_name*.

    Any failure is returned as a message string — Gradio renders the return
    value either way, so errors surface in the UI instead of raising.
    """
    try:
        # Resolve and validate the artifacts before loading anything heavy.
        model_path = os.path.join(MODELS_DIR, model_name)
        if not os.path.exists(model_path):
            return "Error: Selected model file not found."

        if not os.path.exists(TOKENIZER_PATH):
            return "Error: Tokenizer not found. Please make sure you have trained a model first."

        captioner = load_model(model_path)
        tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_PATH)

        # Generate caption
        return generate_caption(captioner, image, tokenizer)

    except Exception as e:
        return f"Error occurred: {str(e)}"
38
+
39
+ # Create Gradio interface
40
def create_interface():
    """Build the Gradio interface, or an explanatory stub when no models exist."""
    available_models = list_available_models()
    if not available_models:
        # No checkpoints to choose from: show a stub that explains the problem.
        # FIX: the message previously said "./models" although checkpoints are
        # looked up in MODELS_DIR (./TeLVE/models).
        return gr.Interface(
            fn=lambda x: "No models found in ./TeLVE/models directory. Please train a model first.",
            inputs="image",
            outputs="text",
            title="TeLVE - Turkish efficient Language Vision Engine",
            description="Error: No models available"
        )

    interface = gr.Interface(
        fn=generate_description,
        inputs=[
            gr.Image(type="filepath", label="Upload Image"),
            gr.Dropdown(choices=available_models, label="Select Model", value=available_models[0])
        ],
        outputs=gr.Textbox(label="Generated Caption"),
        # FIX: consistent branding — the project expands TeLVE as
        # "Turkish efficient Language Vision Engine" everywhere else
        # (README, error stub above), not "Language Vision Encoder".
        title="TeLVE - Turkish efficient Language Vision Engine",
        description="Upload an image to generate a Turkish caption.",
        # NOTE(review): example paths assume a top-level ./images directory;
        # the repo ships TeLVE/images — confirm which layout the Space uses.
        examples=[
            ["./images/example1.jpg", available_models[0]],
            ["./images/example2.jpg", available_models[0]]
        ] if os.path.exists("./images") else None
    )
    return interface
66
+
67
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces with a public share link.
    create_interface().launch(share=True, server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ torch>=1.9.0
3
+ torchvision>=0.10.0
4
+ transformers>=4.11.0
5
+ gradio>=3.0.0
6
+ pandas>=1.3.0
7
+ Pillow>=8.0.0
8
+ tqdm>=4.62.0