# ------------------------------
# AI Multi‑Modal Assistant: Phase 2 (OCR added, enhanced version)
# ------------------------------
import gradio as gr
from transformers import pipeline, Wav2Vec2Processor, TFWav2Vec2ForCTC
from PIL import Image
import torch
from torchvision import models, transforms
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import io
from textwrap import wrap
import yake  # keyword extraction
import tempfile
from fpdf import FPDF
import time
import tensorflow as tf
import soundfile as sf
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0 # Consistent language detection
import os
import subprocess
# Ensure tesseract is available at runtime
try:
subprocess.run(["tesseract", "--version"], check=True, stdout=subprocess.PIPE)
except (subprocess.CalledProcessError, FileNotFoundError):
print("Installing tesseract runtime...")
os.system("apt-get update -y && apt-get install -y tesseract-ocr")
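# NOTE: apt-get needs root; on Hugging Face Spaces the cleaner route is to
# list tesseract-ocr in a packages.txt file so it is installed at build time.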
import pytesseract # <-- OCR
# ------------------------------
# 1. Load Models & Labels
# ------------------------------
# NLP pipelines
sentiment_model = pipeline(
"sentiment-analysis",
model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)
# Summarization Model
summarizer_model = pipeline("summarization", model="facebook/bart-large-cnn")
# Image classification model
image_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)  # replaces deprecated pretrained=True
image_model.eval()
preprocess = transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
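# Fetch the ImageNet labels file if it is not already present (a convenience
# sketch mirroring the font download further below; the URL is the labels file
# used in the PyTorch hub examples):
if not os.path.exists("imagenet_classes.txt"):
    import requests
    labels_url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
    with open("imagenet_classes.txt", "w") as f:
        f.write(requests.get(labels_url, timeout=60).text)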
# Load ImageNet class labels
with open("imagenet_classes.txt", "r") as f: # ensure this file is in your folder
imagenet_labels = [s.strip() for s in f.readlines()]
# Keyword extraction
kw_extractor = yake.KeywordExtractor(lan="en", top=5)
# ------------------------------
# 2. Helper Functions
# ------------------------------
def analyze_text(text: str) -> dict:
sentiment = sentiment_model(text)[0]
    summary = summarizer_model(
        text,
        max_length=min(len(text.split()) + 10, 50),
        min_length=5,
        truncation=True,  # guard against inputs longer than the model's limit
    )[0]["summary_text"]
keywords = [kw for kw, score in kw_extractor.extract_keywords(text)]
return {
"Sentiment": sentiment["label"],
"Sentiment Score": round(sentiment["score"], 3),
"Summary": summary,
"Keywords": keywords,
}
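# Illustrative output (values depend on the input):
#   analyze_text("The new update is fantastic and much faster.")
#   -> {"Sentiment": "POSITIVE", "Sentiment Score": 0.999,
#       "Summary": "...", "Keywords": ["new update", ...]}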
def analyze_image(image: Image.Image) -> dict:
img_t = preprocess(image).unsqueeze(0)
with torch.no_grad():
outputs = image_model(img_t)
class_idx = outputs.argmax().item()
class_label = imagenet_labels[class_idx] if 0 <= class_idx < len(imagenet_labels) else f"Class index {class_idx}"
return { "Predicted Class Index": class_idx, "Predicted Class Label": class_label }
def ocr_image(image: Image.Image) -> dict:
"""Extract text from uploaded image using Tesseract OCR."""
text = pytesseract.image_to_string(image)
return {"Extracted Text": text}
def generate_pdf(results: dict) -> str:
buffer = io.BytesIO()
c = canvas.Canvas(buffer, pagesize=letter)
width, height = letter
c.setFont("Helvetica", 12)
c.drawString(50, height - 50, "AI Multi-Modal Assistant Report")
    y = height - 80  # starting y position
for key, value in results.items():
# Convert any complex types to string
if isinstance(value, (list, dict)):
value = str(value)
# Wrap long lines (max 90 chars per line)
wrapped_lines = wrap(f"{key}: {value}", width=90)
for line in wrapped_lines:
c.drawString(50, y, line)
y -= 18
if y < 60: # New page if needed
c.showPage()
c.setFont("Helvetica", 12)
y = height - 50
y -= 10 # Space between entries
c.save()
buffer.seek(0)
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
tmp.write(buffer.getvalue())
tmp_path = tmp.name
return tmp_path
# ------------------------------
# 3. Multi‑Modal Analysis Function
# ------------------------------
def analyze(input_data):
# Handles both text and image correctly in Gradio
if isinstance(input_data, str) and input_data.strip():
return analyze_text(input_data)
elif isinstance(input_data, dict) and "image" in input_data:
return analyze_image(input_data["image"])
elif isinstance(input_data, Image.Image):
return analyze_image(input_data)
else:
return {"Error": "Please enter text or upload an image."}
# ------------------------------
# 4. Gradio UI Layout
# ------------------------------
with gr.Blocks() as demo:
gr.Markdown("## AI Multi‑Modal Assistant")
# ------------------ Image Analysis Tab ------------------
with gr.Tab("Image Analysis"):
image_input = gr.Image(type="pil", label="Upload an image for classification")
        analyze_image_button = gr.Button("Analyze Image")
image_output = gr.JSON(label="Image Analysis Results")
pdf_button_image = gr.Button("Download Report (PDF)")
analyze_image_button.click(fn=analyze, inputs=image_input, outputs=image_output)
pdf_button_image.click(
fn=lambda x: generate_pdf(analyze(x)),
inputs=image_input,
outputs=gr.File(label="Download PDF Report"),
)
with gr.Tab("Text Analysis"):
text_input = gr.Textbox(
label="Enter text to analyze",
placeholder="Type your text here...",
lines=5
)
analyze_text_button = gr.Button("Analyze Text")
text_output = gr.JSON(label="Text Analysis Results")
pdf_button_text = gr.Button("Download Report (PDF)")
# Text analysis events
analyze_text_button.click(
fn=analyze,
inputs=text_input,
outputs=text_output
)
pdf_button_text.click(
fn=lambda x: generate_pdf(analyze(x)),
inputs=text_input,
outputs=gr.File(label="Download PDF Report")
)
# ------------------ OCR Tab ------------------
with gr.Tab("OCR"):
ocr_input = gr.Image(type="pil", label="Upload image for OCR")
ocr_button = gr.Button("Extract Text (OCR)")
ocr_output = gr.JSON(label="OCR Results")
pdf_button_ocr = gr.Button("Download OCR PDF")
ocr_button.click(fn=ocr_image, inputs=ocr_input, outputs=ocr_output)
pdf_button_ocr.click(
fn=lambda x: generate_pdf(x),
inputs=ocr_output,
outputs=gr.File(label="Download PDF Report"),
)
# ------------------------------
# 5. Voice Input: Speech-to-Text + Language Detection + Translation
# ------------------------------
# ------------------------------
# Auto-download DejaVuSans.ttf for all Unicode scripts
# ------------------------------
FONT_PATH = "DejaVuSans.ttf"
if not os.path.exists(FONT_PATH):
    import requests
    url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"
    r = requests.get(url, timeout=60)
    r.raise_for_status()  # fail loudly if the font cannot be fetched
    with open(FONT_PATH, "wb") as f:
        f.write(r.content)
# ------------------------------
# PDF generator with full Unicode support
# ------------------------------
# NOTE: this redefines the reportlab-based generate_pdf above; Gradio callbacks
# resolve names at call time, so every PDF button in the app uses this
# Unicode-capable FPDF version.
def generate_pdf(content_dict, output_file=None):
    pdf = FPDF()
    pdf.add_page()
    pdf.add_font("DejaVu", "", FONT_PATH, uni=True)
    pdf.set_font("DejaVu", size=12)
    for key, value in content_dict.items():
        if isinstance(value, (list, dict)):
            value = str(value)  # multi_cell expects a string
        pdf.set_font("DejaVu", "B", 12)
        pdf.cell(0, 10, f"{key}:", ln=True)
        pdf.set_font("DejaVu", "", 12)
        pdf.multi_cell(0, 10, value)
        pdf.ln(5)
    if not output_file:
        output_file = f"transcript_{int(time.time())}.pdf"
    pdf.output(output_file)
    return output_file
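# Illustrative usage: generate_pdf({"Summary": "..."}) writes transcript_<epoch>.pdf
# in the working directory and returns its path for Gradio's File component.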
# ------------------------------
# TF-based STT (Wav2Vec2 for multilingual)
# ------------------------------
# NOTE: "facebook/wav2vec2-large-xlsr-53" is a pretraining-only checkpoint with
# no tokenizer or CTC head, so loading a processor from it fails at runtime.
# The fine-tuned checkpoint below is an assumption (English-focused); swap in
# an XLSR checkpoint fine-tuned for the languages you expect.
STT_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor = Wav2Vec2Processor.from_pretrained(STT_CHECKPOINT)
stt_model = TFWav2Vec2ForCTC.from_pretrained(STT_CHECKPOINT, from_pt=True)  # convert PyTorch weights to TF
def transcribe_audio(file_path):
    speech, _ = sf.read(file_path)
    if speech.ndim > 1:
        speech = speech.mean(axis=1)  # downmix stereo to mono
    # Wav2Vec2 expects 16 kHz audio; see the resampling sketch below for other rates.
    input_values = processor(speech, return_tensors="tf", sampling_rate=16000).input_values
    logits = stt_model(input_values).logits
    predicted_ids = tf.argmax(logits, axis=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
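# Gradio microphone recordings typically arrive at 44.1 or 48 kHz. A minimal
# loader that resamples everything to 16 kHz mono (assumes the extra dependency
# librosa is installed; add it to requirements.txt):
def load_audio_16k(file_path):
    import librosa  # assumed available
    speech, _ = librosa.load(file_path, sr=16000, mono=True)
    return speech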
# ------------------------------
# Language detection
# ------------------------------
def detect_language(text):
try:
lang_code = detect(text)
# Convert language code to full language name (for PDF)
import pycountry
lang = pycountry.languages.get(alpha_2=lang_code)
return lang.name if lang else lang_code
    except Exception:  # langdetect raises on empty or ambiguous input
        return "unknown"
# ------------------------------
# TensorFlow translation (multilingual -> English)
# ------------------------------
model_name = "Helsinki-NLP/opus-mt-mul-en"
translator = pipeline("translation", model=model_name, framework="tf")
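# Illustrative usage: translator("Bonjour le monde")[0]["translation_text"]
# yields an English rendering such as "Hello world".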
# ------------------------------
# Gradio App
# ------------------------------
# Re-open the existing Blocks instead of creating a second app: a fresh
# gr.Blocks() would rebind `demo`, so demo.launch() below would serve only
# the Voice tab and drop the Image/Text/OCR tabs.
with demo:
with gr.Tab("Voice Input"):
gr.Markdown("### 🎤 Multilingual Speech-to-Text + Translation + PDF")
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎙️ Record or upload an audio file")
transcribe_button = gr.Button("Transcribe, Detect & Translate")
detected_lang_output = gr.Textbox(label="🌐 Detected Language", lines=1)
transcription_output = gr.Textbox(label="🗣️ Original Transcription", lines=3)
translation_output = gr.Textbox(label="🌍 Translated to English", lines=3)
pdf_button_audio = gr.Button("Download Transcript (PDF)")
def transcribe_detect_translate(audio_file):
if not audio_file:
return "", "Please record or upload an audio file.", ""
# Step 1: Transcription
original_text = transcribe_audio(audio_file)
if not original_text.strip():
return "", "No speech detected.", ""
# Step 2: Language detection
detected_lang = detect_language(original_text)
# Step 3: Translate if not English
translated_text = (
translator(original_text)[0]["translation_text"]
if detected_lang.lower() != "english"
else original_text
)
return detected_lang, original_text, translated_text
transcribe_button.click(
fn=transcribe_detect_translate,
inputs=audio_input,
outputs=[detected_lang_output, transcription_output, translation_output]
)
pdf_button_audio.click(
fn=lambda lang, orig, trans: generate_pdf({
"Detected Language": lang,
"Original Transcription": orig,
"Translated to English": trans
}),
inputs=[detected_lang_output, transcription_output, translation_output],
outputs=gr.File(label="Download PDF Report")
)
demo.launch()