import gradio as gr import torch import numpy as np import struct import lzma import json from huggingface_hub import hf_hub_download from transformers import T5Config, T5ForConditionalGeneration, AutoTokenizer # Download quantized model model_path = hf_hub_download(repo_id="ag14850/Mosquito", filename="mosquito_tiny.bin.xz") def unpack_nbits(data, bits, count): if bits == 8: return np.frombuffer(data, dtype=np.uint8)[:count] result = [] if bits == 4: for byte in data: result.append((byte >> 4) & 0x0F) result.append(byte & 0x0F) elif bits == 6: for i in range(0, len(data), 3): if i + 2 >= len(data): break b0, b1, b2 = data[i], data[i+1], data[i+2] result.append((b0 >> 2) & 0x3F) result.append(((b0 & 0x03) << 4) | ((b1 >> 4) & 0x0F)) result.append(((b1 & 0x0F) << 2) | ((b2 >> 6) & 0x03)) result.append(b2 & 0x3F) elif bits == 5: for i in range(0, len(data), 5): if i + 4 >= len(data): break packed = int.from_bytes(data[i:i+5], 'little') for j in range(8): result.append((packed >> (j * 5)) & 0x1F) elif bits == 7: for i in range(0, len(data), 7): if i + 6 >= len(data): break packed = int.from_bytes(data[i:i+7], 'little') for j in range(8): result.append((packed >> (j * 7)) & 0x7F) return np.array(result[:count], dtype=np.uint8) def load_quantized_model(path): with lzma.open(path, 'rb') as f: data = f.read() offset = 0 version, default_bits, num_params = struct.unpack_from('