danicor commited on
Commit
134d4a1
·
verified ·
1 Parent(s): 191f829

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +444 -308
app.py CHANGED
@@ -133,7 +133,7 @@ class DatabaseManager:
133
  with sqlite3.connect(self.db_path) as conn:
134
  cursor = conn.cursor()
135
 
136
- # Cache table
137
  cursor.execute('''
138
  CREATE TABLE IF NOT EXISTS cache (
139
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -141,12 +141,20 @@ class DatabaseManager:
141
  filename TEXT,
142
  file_size INTEGER,
143
  transcription TEXT,
 
144
  language TEXT,
145
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
146
  last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
147
  )
148
  ''')
149
 
 
 
 
 
 
 
 
150
  # Processing status table
151
  cursor.execute('''
152
  CREATE TABLE IF NOT EXISTS processing_status (
@@ -162,24 +170,32 @@ class DatabaseManager:
162
  )
163
  ''')
164
 
165
- # Translation cache table
166
  cursor.execute('''
167
  CREATE TABLE IF NOT EXISTS translation_cache (
168
  id INTEGER PRIMARY KEY AUTOINCREMENT,
169
  text_hash TEXT,
 
170
  target_language TEXT,
171
  translated_text TEXT,
172
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
173
  last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
174
- UNIQUE(text_hash, target_language)
175
  )
176
  ''')
177
 
 
 
 
 
 
 
 
178
  # Create indexes for better performance
179
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_cache_hash ON cache(file_hash)')
180
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_cache_created ON cache(created_at)')
181
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_status_hash ON processing_status(file_hash)')
182
- cursor.execute('CREATE INDEX IF NOT EXISTS idx_translation_hash ON translation_cache(text_hash, target_language)')
183
 
184
  conn.commit()
185
 
@@ -203,13 +219,13 @@ def calculate_text_hash(text: str) -> str:
203
  """Calculate hash for text"""
204
  return hashlib.md5(text.encode('utf-8')).hexdigest()
205
 
206
- async def get_from_cache(file_hash: str) -> Optional[str]:
207
- """Get transcription from cache"""
208
  try:
209
  with db_manager.get_connection() as conn:
210
  cursor = conn.cursor()
211
  cursor.execute(
212
- 'SELECT transcription FROM cache WHERE file_hash = ?',
213
  (file_hash,)
214
  )
215
  result = cursor.fetchone()
@@ -220,29 +236,63 @@ async def get_from_cache(file_hash: str) -> Optional[str]:
220
  (file_hash,)
221
  )
222
  conn.commit()
223
- return result[0]
 
 
 
 
 
 
 
 
 
 
 
224
  return None
225
  except Exception as e:
226
  logger.error(f"Error getting from cache: {e}")
227
  return None
228
 
229
- async def get_translation_from_cache(text_hash: str, target_language: str) -> Optional[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  """Get translation from cache"""
231
  try:
232
  with db_manager.get_connection() as conn:
233
  cursor = conn.cursor()
234
  cursor.execute(
235
  '''SELECT translated_text FROM translation_cache
236
- WHERE text_hash = ? AND target_language = ?''',
237
- (text_hash, target_language)
238
  )
239
  result = cursor.fetchone()
240
 
241
  if result:
242
  cursor.execute(
243
  '''UPDATE translation_cache SET last_accessed = CURRENT_TIMESTAMP
244
- WHERE text_hash = ? AND target_language = ?''',
245
- (text_hash, target_language)
246
  )
247
  conn.commit()
248
  return result[0]
@@ -251,31 +301,31 @@ async def get_translation_from_cache(text_hash: str, target_language: str) -> Op
251
  logger.error(f"Error getting translation from cache: {e}")
252
  return None
253
 
254
- async def save_to_cache(file_hash: str, filename: str, file_size: int, transcription: str, language: str = None):
255
- """Save transcription to cache"""
256
  try:
257
  with db_manager.get_connection() as conn:
258
  cursor = conn.cursor()
259
  cursor.execute(
260
  '''INSERT OR REPLACE INTO cache
261
- (file_hash, filename, file_size, transcription, language)
262
- VALUES (?, ?, ?, ?, ?)''',
263
- (file_hash, filename, file_size, transcription, language)
264
  )
265
  conn.commit()
266
  except Exception as e:
267
  logger.error(f"Error saving to cache: {e}")
268
 
269
- async def save_translation_to_cache(text_hash: str, target_language: str, translated_text: str):
270
  """Save translation to cache"""
271
  try:
272
  with db_manager.get_connection() as conn:
273
  cursor = conn.cursor()
274
  cursor.execute(
275
  '''INSERT OR REPLACE INTO translation_cache
276
- (text_hash, target_language, translated_text)
277
- VALUES (?, ?, ?)''',
278
- (text_hash, target_language, translated_text)
279
  )
280
  conn.commit()
281
  except Exception as e:
@@ -368,13 +418,13 @@ def estimate_processing_time(file_size_mb: float) -> int:
368
  return max(1, int(estimated_seconds / 60))
369
 
370
  async def background_transcription(file_path: str, file_hash: str, filename: str, file_size: int, translate_to_english: bool = False):
371
- """Background task for transcription"""
372
  try:
373
  logger.info(f"Starting background transcription for {filename}")
374
 
375
  await update_processing_status(file_hash, status='processing', progress=10)
376
 
377
- # Transcribe audio
378
  result = whisper_model.transcribe(
379
  file_path,
380
  fp16=(device != "cpu"),
@@ -384,7 +434,7 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
384
  word_timestamps=False
385
  )
386
 
387
- await update_processing_status(file_hash, progress=60)
388
 
389
  text = result["text"].strip() or "No text detected"
390
  detected_language = result.get("language", "unknown")
@@ -395,27 +445,39 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
395
  "from_cache": False
396
  }
397
 
398
- # Translate if requested and needed
399
- if translate_to_english and detected_language != "en":
400
- await update_processing_status(file_hash, progress=80)
 
401
 
 
402
  english_result = whisper_model.transcribe(
403
  file_path,
404
  fp16=(device != "cpu"),
405
  language=None,
406
- task="translate",
407
  verbose=False,
408
  word_timestamps=False
409
  )
410
 
411
- english_text = english_result["text"].strip()
412
- if english_text:
413
- response_data["english_text"] = english_text
 
 
 
 
 
 
 
 
414
 
415
- # Save to cache
 
 
416
  await save_to_cache(
417
  file_hash, filename, file_size,
418
- json.dumps(response_data), detected_language
419
  )
420
 
421
  await update_processing_status(file_hash, status='completed', progress=100)
@@ -433,236 +495,41 @@ async def background_transcription(file_path: str, file_hash: str, filename: str
433
  except Exception as e:
434
  logger.error(f"Error deleting temp file: {e}")
435
 
436
- def split_text_smartly(text: str, max_tokens: int = 400) -> list:
437
- """Improved text splitting that handles various languages and formats"""
438
- # First try to split by sentences (multiple patterns for different languages)
439
- sentence_patterns = [
440
- r'(?<=[.!?])\s+', # English
441
- r'(?<=[。!?])\s*', # Chinese/Japanese
442
- r'(?<=[۔؟!])\s+', # Persian/Arabic
443
- r'(?<=[\.!?])\s+' # Fallback
444
- ]
445
-
446
- sentences = []
447
- remaining_text = text
448
-
449
- for pattern in sentence_patterns:
450
- try:
451
- potential_sentences = re.split(pattern, remaining_text)
452
- if len(potential_sentences) > 1:
453
- sentences = potential_sentences
454
- break
455
- except:
456
- continue
457
-
458
- # If no sentence splitting worked, split by length
459
- if not sentences or len(sentences) == 1:
460
- chunk_size = 200 # Conservative chunk size
461
- sentences = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
462
-
463
- # Group sentences into chunks that fit token limit
464
- chunks = []
465
- current_chunk = []
466
- current_length = 0
467
-
468
- for sentence in sentences:
469
- if not sentence.strip():
470
- continue
471
-
472
- try:
473
- sentence_tokens = len(translation_tokenizer.tokenize(sentence))
474
- except:
475
- # Estimate tokens if tokenizer fails
476
- sentence_tokens = len(sentence.split()) * 1.3
477
-
478
- if current_length + sentence_tokens > max_tokens and current_chunk:
479
- chunks.append(' '.join(current_chunk).strip())
480
- current_chunk = [sentence]
481
- current_length = sentence_tokens
482
- else:
483
- current_chunk.append(sentence)
484
- current_length += sentence_tokens
485
-
486
- if current_chunk:
487
- chunks.append(' '.join(current_chunk).strip())
488
-
489
- # Remove empty chunks
490
- chunks = [chunk for chunk in chunks if chunk.strip()]
491
-
492
- return chunks if chunks else [text]
493
-
494
- def translate_text_chunk(text: str, target_code: str, max_retries: int = 3) -> str:
495
- """Improved translation with retry logic and better error handling"""
496
- if not text.strip():
497
- return ""
498
-
499
- for attempt in range(max_retries):
500
- try:
501
- # Use longer max_length for better translation quality
502
- max_length = min(1024, len(text) * 2) # Dynamic max length
503
-
504
- inputs = translation_tokenizer(
505
- text,
506
- return_tensors="pt",
507
- padding=True,
508
- truncation=True,
509
- max_length=512 # Input limit
510
- )
511
-
512
- if device == "cuda":
513
- inputs = {k: v.to(device) for k, v in inputs.items()}
514
-
515
- # Generate translation with better parameters
516
- translated_tokens = translation_model.generate(
517
- **inputs,
518
- forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
519
- max_length=max_length, # Use dynamic max length
520
- min_length=5, # Ensure minimum output
521
- num_beams=4,
522
- early_stopping=True,
523
- do_sample=False, # Deterministic output
524
- temperature=1.0,
525
- repetition_penalty=1.1
526
- )
527
-
528
- translated_text = translation_tokenizer.batch_decode(
529
- translated_tokens,
530
- skip_special_tokens=True
531
- )[0].strip()
532
-
533
- if translated_text and len(translated_text) > 2:
534
- return translated_text
535
- else:
536
- logger.warning(f"Empty translation on attempt {attempt + 1}")
537
-
538
- except Exception as e:
539
- logger.error(f"Translation attempt {attempt + 1} failed: {e}")
540
- if attempt == max_retries - 1:
541
- return f"[Translation failed: {text[:50]}...]"
542
-
543
- return f"[Translation failed after {max_retries} attempts]"
544
-
545
- async def debug_translate_endpoint(
546
- text: str = Form(..., min_length=1),
547
- target_language: str = Form(...)
548
- ):
549
- """Debug version of translation endpoint with detailed logging"""
550
-
551
- if not translation_model:
552
- raise HTTPException(status_code=503, detail="Translation service not available")
553
-
554
- text = text.strip()
555
- logger.info(f"=== TRANSLATION DEBUG START ===")
556
- logger.info(f"Original text length: {len(text)} characters")
557
- logger.info(f"Original text preview: {text[:200]}...")
558
- logger.info(f"Target language: {target_language}")
559
-
560
- target_language_lower = target_language.lower()
561
- if target_language_lower not in LANGUAGE_CODES:
562
- raise HTTPException(
563
- status_code=400,
564
- detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
565
- )
566
-
567
- # Check cache
568
- text_hash = calculate_text_hash(text)
569
- cached_translation = await get_translation_from_cache(text_hash, target_language_lower)
570
- if cached_translation:
571
- logger.info("Returning cached translation")
572
- return JSONResponse({
573
- "text": text,
574
- "translated_text": cached_translation,
575
- "target_language": target_language,
576
- "from_cache": True
577
- })
578
-
579
  try:
580
- target_code = LANGUAGE_CODES[target_language_lower]
581
-
582
- # Smart text splitting with debug info
583
- chunks = split_text_smartly(text, max_tokens=350)
584
- logger.info(f"Text split into {len(chunks)} chunks")
585
-
586
- for i, chunk in enumerate(chunks):
587
- logger.info(f"Chunk {i+1}: length={len(chunk)}, preview='{chunk[:100]}...'")
588
-
589
- translated_chunks = []
590
- debug_info = {
591
- "total_chunks": len(chunks),
592
- "successful_chunks": 0,
593
- "failed_chunks": 0,
594
- "chunk_details": []
595
- }
596
-
597
- for i, chunk in enumerate(chunks):
598
- if not chunk.strip():
599
- continue
600
-
601
- chunk_info = {
602
- "chunk_id": i+1,
603
- "original_length": len(chunk),
604
- "original_preview": chunk[:100]
605
- }
606
-
607
- logger.info(f"Processing chunk {i+1}/{len(chunks)}")
608
-
609
- translated_chunk = translate_text_chunk(chunk, target_code)
610
-
611
- chunk_info["translated_preview"] = translated_chunk[:100] if translated_chunk else "FAILED"
612
- chunk_info["translated_length"] = len(translated_chunk) if translated_chunk else 0
613
-
614
- if translated_chunk and not translated_chunk.startswith("[Translation"):
615
- translated_chunks.append(translated_chunk)
616
- debug_info["successful_chunks"] += 1
617
- chunk_info["status"] = "success"
618
- logger.info(f"Chunk {i+1} translated successfully: {len(translated_chunk)} chars")
619
- else:
620
- debug_info["failed_chunks"] += 1
621
- chunk_info["status"] = "failed"
622
- logger.error(f"Chunk {i+1} translation failed: {translated_chunk}")
623
-
624
- debug_info["chunk_details"].append(chunk_info)
625
-
626
- if not translated_chunks:
627
- logger.error("All translation chunks failed!")
628
- raise HTTPException(status_code=500, detail="Translation failed for all text chunks")
629
-
630
- # Combine translated chunks
631
- translated_text = ' '.join(translated_chunks)
632
- logger.info(f"Combined translation length: {len(translated_text)} characters")
633
- logger.info(f"Translation preview: {translated_text[:200]}...")
634
-
635
- # Clean up the translation
636
- original_length = len(translated_text)
637
- translated_text = re.sub(r'\s+', ' ', translated_text).strip()
638
- logger.info(f"After cleanup: {len(translated_text)} characters (was {original_length})")
639
-
640
- # Quality check
641
- translation_ratio = len(translated_text) / len(text) if len(text) > 0 else 0
642
- logger.info(f"Translation ratio: {translation_ratio:.2f} (translated/original)")
643
 
644
- if translation_ratio < 0.1:
645
- logger.warning(f"Translation seems too short! Ratio: {translation_ratio}")
646
 
647
- # Save to cache
648
- await save_translation_to_cache(text_hash, target_language_lower, translated_text)
 
 
 
 
 
 
649
 
650
- logger.info("=== TRANSLATION DEBUG END ===")
 
 
 
 
651
 
652
- return JSONResponse({
653
- "text": text,
654
- "translated_text": translated_text,
655
- "target_language": target_language,
656
- "from_cache": False,
657
- "debug_info": debug_info,
658
- "translation_ratio": translation_ratio
659
- })
660
 
661
- except HTTPException:
662
- raise
663
  except Exception as e:
664
- logger.error(f"Translation error: {e}")
665
- raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
666
 
667
  @app.get("/")
668
  async def root():
@@ -679,12 +546,17 @@ async def root():
679
 
680
  cursor.execute('SELECT COUNT(*) FROM translation_cache')
681
  translation_cache_count = cursor.fetchone()[0] or 0
 
 
 
 
682
 
683
  return {
684
  "message": "Whisper API with Translation is running",
685
  "device": device,
686
  "cuda_available": torch.cuda.is_available(),
687
  "cached_files": cache_count,
 
688
  "translation_cache": translation_cache_count,
689
  "currently_processing": processing_count,
690
  "translation_available": translation_model is not None,
@@ -701,7 +573,7 @@ async def transcribe_audio(
701
  translate_to_english: bool = False,
702
  language: Optional[str] = Query(None, description="Specify language code for better accuracy")
703
  ):
704
- """Transcribe audio file to text"""
705
  tmp_file_path = None
706
 
707
  try:
@@ -733,7 +605,7 @@ async def transcribe_audio(
733
  if cached_result:
734
  logger.info("Cache hit - returning cached result")
735
  await remove_processing_status(file_hash)
736
- return JSONResponse(json.loads(cached_result))
737
 
738
  # Check if already processing
739
  processing_status = await get_processing_status(file_hash)
@@ -783,18 +655,28 @@ async def transcribe_audio(
783
  "from_cache": False
784
  }
785
 
786
- # Translate if requested
787
- if translate_to_english and detected_language != "en":
 
 
788
  transcribe_args['task'] = "translate"
789
  english_result = whisper_model.transcribe(tmp_file_path, **transcribe_args)
790
- english_text = english_result["text"].strip()
791
- if english_text:
792
- response_data["english_text"] = english_text
 
 
 
 
 
 
 
 
793
 
794
- # Save to cache
795
  await save_to_cache(
796
  file_hash, file.filename, file_size,
797
- json.dumps(response_data), detected_language
798
  )
799
 
800
  return JSONResponse(response_data)
@@ -836,9 +718,10 @@ async def transcribe_audio(
836
  @app.post("/translate")
837
  async def translate_endpoint(
838
  text: str = Form(..., min_length=1),
839
- target_language: str = Form(...)
 
840
  ):
841
- """Improved translation endpoint with better chunking and error handling"""
842
 
843
  if not translation_model:
844
  raise HTTPException(status_code=503, detail="Translation service not available")
@@ -854,76 +737,272 @@ async def translate_endpoint(
854
  detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
855
  )
856
 
857
- # Check cache
858
- text_hash = calculate_text_hash(text)
859
- cached_translation = await get_translation_from_cache(text_hash, target_language_lower)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
860
  if cached_translation:
861
  return JSONResponse({
862
  "text": text,
863
  "translated_text": cached_translation,
864
  "target_language": target_language,
865
- "from_cache": True
 
866
  })
867
 
 
868
  try:
869
  target_code = LANGUAGE_CODES[target_language_lower]
870
 
871
- # Smart text splitting
872
- chunks = split_text_smartly(text, max_tokens=350) # Conservative token limit
873
- logger.info(f"Split text into {len(chunks)} chunks for translation")
874
 
875
- translated_chunks = []
876
 
877
- for i, chunk in enumerate(chunks):
878
- if not chunk.strip():
879
  continue
880
 
881
- logger.info(f"Translating chunk {i+1}/{len(chunks)}: '{chunk[:100]}...'")
882
-
883
- translated_chunk = translate_text_chunk(chunk, target_code)
884
 
885
- if translated_chunk and not translated_chunk.startswith("[Translation"):
886
- translated_chunks.append(translated_chunk)
887
- else:
888
- logger.error(f"Failed to translate chunk {i+1}: {chunk[:50]}...")
889
- # Try to translate smaller pieces of the failed chunk
890
- smaller_chunks = split_text_smartly(chunk, max_tokens=200)
891
- for small_chunk in smaller_chunks:
892
- small_translation = translate_text_chunk(small_chunk, target_code)
893
- if small_translation and not small_translation.startswith("[Translation"):
894
- translated_chunks.append(small_translation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
 
896
- if not translated_chunks:
897
- raise HTTPException(status_code=500, detail="Translation failed for all text chunks")
898
 
899
- # Combine translated chunks with proper spacing
900
- translated_text = ' '.join(translated_chunks)
901
 
902
- # Clean up the translation
903
  translated_text = re.sub(r'\s+', ' ', translated_text).strip()
904
- translated_text = re.sub(r'([.!?])\s*([.!?])', r'\1 \2', translated_text) # Fix punctuation
905
-
906
- if len(translated_text) < len(text) * 0.1: # Sanity check
907
- logger.warning("Translation seems too short compared to original")
908
 
909
  # Save to cache
910
- await save_translation_to_cache(text_hash, target_language_lower, translated_text)
911
 
912
  return JSONResponse({
913
  "text": text,
914
  "translated_text": translated_text,
915
  "target_language": target_language,
916
  "from_cache": False,
917
- "chunks_processed": len(chunks),
918
- "chunks_translated": len(translated_chunks)
919
  })
920
 
921
- except HTTPException:
922
- raise
923
  except Exception as e:
924
  logger.error(f"Translation error: {e}")
925
  raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
926
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927
  @app.get("/languages")
928
  async def get_supported_languages():
929
  """Get list of supported languages for translation"""
@@ -940,13 +1019,12 @@ async def check_status(file_hash: str):
940
  cached_result = await get_from_cache(file_hash)
941
  if cached_result:
942
  await remove_processing_status(file_hash)
943
- cached_data = json.loads(cached_result)
944
- cached_data.update({
945
  "status": "completed",
946
  "from_cache": True,
947
  "message": "Processing completed and result is ready"
948
  })
949
- return JSONResponse(cached_data)
950
 
951
  # Check processing status
952
  processing_status = await get_processing_status(file_hash)
@@ -978,6 +1056,64 @@ async def health_check():
978
  "translation_loaded": translation_model is not None
979
  }
980
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
981
  if __name__ == "__main__":
982
  uvicorn.run(
983
  app,
 
133
  with sqlite3.connect(self.db_path) as conn:
134
  cursor = conn.cursor()
135
 
136
+ # Cache table - now includes english_reference
137
  cursor.execute('''
138
  CREATE TABLE IF NOT EXISTS cache (
139
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
141
  filename TEXT,
142
  file_size INTEGER,
143
  transcription TEXT,
144
+ english_reference TEXT,
145
  language TEXT,
146
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
147
  last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
148
  )
149
  ''')
150
 
151
+ # Add english_reference column if it doesn't exist (for existing databases)
152
+ cursor.execute("PRAGMA table_info(cache)")
153
+ columns = [column[1] for column in cursor.fetchall()]
154
+ if 'english_reference' not in columns:
155
+ cursor.execute('ALTER TABLE cache ADD COLUMN english_reference TEXT')
156
+ logger.info("Added english_reference column to cache table")
157
+
158
  # Processing status table
159
  cursor.execute('''
160
  CREATE TABLE IF NOT EXISTS processing_status (
 
170
  )
171
  ''')
172
 
173
+ # Translation cache table - now includes source_language
174
  cursor.execute('''
175
  CREATE TABLE IF NOT EXISTS translation_cache (
176
  id INTEGER PRIMARY KEY AUTOINCREMENT,
177
  text_hash TEXT,
178
+ source_language TEXT DEFAULT 'english',
179
  target_language TEXT,
180
  translated_text TEXT,
181
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
182
  last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
183
+ UNIQUE(text_hash, source_language, target_language)
184
  )
185
  ''')
186
 
187
+ # Add source_language column if it doesn't exist (for existing databases)
188
+ cursor.execute("PRAGMA table_info(translation_cache)")
189
+ columns = [column[1] for column in cursor.fetchall()]
190
+ if 'source_language' not in columns:
191
+ cursor.execute('ALTER TABLE translation_cache ADD COLUMN source_language TEXT DEFAULT "english"')
192
+ logger.info("Added source_language column to translation_cache table")
193
+
194
  # Create indexes for better performance
195
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_cache_hash ON cache(file_hash)')
196
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_cache_created ON cache(created_at)')
197
  cursor.execute('CREATE INDEX IF NOT EXISTS idx_status_hash ON processing_status(file_hash)')
198
+ cursor.execute('CREATE INDEX IF NOT EXISTS idx_translation_hash ON translation_cache(text_hash, source_language, target_language)')
199
 
200
  conn.commit()
201
 
 
219
  """Calculate hash for text"""
220
  return hashlib.md5(text.encode('utf-8')).hexdigest()
221
 
222
+ async def get_from_cache(file_hash: str) -> Optional[Dict[str, Any]]:
223
+ """Get transcription from cache - now returns both original and english reference"""
224
  try:
225
  with db_manager.get_connection() as conn:
226
  cursor = conn.cursor()
227
  cursor.execute(
228
+ 'SELECT transcription, english_reference FROM cache WHERE file_hash = ?',
229
  (file_hash,)
230
  )
231
  result = cursor.fetchone()
 
236
  (file_hash,)
237
  )
238
  conn.commit()
239
+
240
+ # Parse the cached transcription data
241
+ try:
242
+ transcription_data = json.loads(result[0]) if result[0] else {}
243
+ except:
244
+ transcription_data = {"text": result[0] or ""}
245
+
246
+ # Add english reference if available
247
+ if result[1]:
248
+ transcription_data["english_reference"] = result[1]
249
+
250
+ return transcription_data
251
  return None
252
  except Exception as e:
253
  logger.error(f"Error getting from cache: {e}")
254
  return None
255
 
256
+ async def get_english_reference_from_cache(file_hash: str) -> Optional[str]:
257
+ """Get English reference text for a cached audio file"""
258
+ try:
259
+ with db_manager.get_connection() as conn:
260
+ cursor = conn.cursor()
261
+ cursor.execute(
262
+ 'SELECT english_reference FROM cache WHERE file_hash = ?',
263
+ (file_hash,)
264
+ )
265
+ result = cursor.fetchone()
266
+
267
+ if result and result[0]:
268
+ cursor.execute(
269
+ 'UPDATE cache SET last_accessed = CURRENT_TIMESTAMP WHERE file_hash = ?',
270
+ (file_hash,)
271
+ )
272
+ conn.commit()
273
+ return result[0]
274
+ return None
275
+ except Exception as e:
276
+ logger.error(f"Error getting english reference from cache: {e}")
277
+ return None
278
+
279
+ async def get_translation_from_cache(text_hash: str, target_language: str, source_language: str = "english") -> Optional[str]:
280
  """Get translation from cache"""
281
  try:
282
  with db_manager.get_connection() as conn:
283
  cursor = conn.cursor()
284
  cursor.execute(
285
  '''SELECT translated_text FROM translation_cache
286
+ WHERE text_hash = ? AND target_language = ? AND source_language = ?''',
287
+ (text_hash, target_language, source_language)
288
  )
289
  result = cursor.fetchone()
290
 
291
  if result:
292
  cursor.execute(
293
  '''UPDATE translation_cache SET last_accessed = CURRENT_TIMESTAMP
294
+ WHERE text_hash = ? AND target_language = ? AND source_language = ?''',
295
+ (text_hash, target_language, source_language)
296
  )
297
  conn.commit()
298
  return result[0]
 
301
  logger.error(f"Error getting translation from cache: {e}")
302
  return None
303
 
304
+ async def save_to_cache(file_hash: str, filename: str, file_size: int, transcription: str, english_reference: str = None, language: str = None):
305
+ """Save transcription to cache - now includes english reference"""
306
  try:
307
  with db_manager.get_connection() as conn:
308
  cursor = conn.cursor()
309
  cursor.execute(
310
  '''INSERT OR REPLACE INTO cache
311
+ (file_hash, filename, file_size, transcription, english_reference, language)
312
+ VALUES (?, ?, ?, ?, ?, ?)''',
313
+ (file_hash, filename, file_size, transcription, english_reference, language)
314
  )
315
  conn.commit()
316
  except Exception as e:
317
  logger.error(f"Error saving to cache: {e}")
318
 
319
+ async def save_translation_to_cache(text_hash: str, target_language: str, translated_text: str, source_language: str = "english"):
320
  """Save translation to cache"""
321
  try:
322
  with db_manager.get_connection() as conn:
323
  cursor = conn.cursor()
324
  cursor.execute(
325
  '''INSERT OR REPLACE INTO translation_cache
326
+ (text_hash, source_language, target_language, translated_text)
327
+ VALUES (?, ?, ?, ?)''',
328
+ (text_hash, source_language, target_language, translated_text)
329
  )
330
  conn.commit()
331
  except Exception as e:
 
418
  return max(1, int(estimated_seconds / 60))
419
 
420
  async def background_transcription(file_path: str, file_hash: str, filename: str, file_size: int, translate_to_english: bool = False):
421
+ """Background task for transcription with English reference caching"""
422
  try:
423
  logger.info(f"Starting background transcription for {filename}")
424
 
425
  await update_processing_status(file_hash, status='processing', progress=10)
426
 
427
+ # Transcribe audio in original language
428
  result = whisper_model.transcribe(
429
  file_path,
430
  fp16=(device != "cpu"),
 
434
  word_timestamps=False
435
  )
436
 
437
+ await update_processing_status(file_hash, progress=40)
438
 
439
  text = result["text"].strip() or "No text detected"
440
  detected_language = result.get("language", "unknown")
 
445
  "from_cache": False
446
  }
447
 
448
+ # Always get English reference if not already English
449
+ english_reference = None
450
+ if detected_language != "en":
451
+ await update_processing_status(file_hash, progress=70)
452
 
453
+ # Get English translation using Whisper's translate task
454
  english_result = whisper_model.transcribe(
455
  file_path,
456
  fp16=(device != "cpu"),
457
  language=None,
458
+ task="translate", # This translates to English
459
  verbose=False,
460
  word_timestamps=False
461
  )
462
 
463
+ english_reference = english_result["text"].strip()
464
+ if english_reference:
465
+ response_data["english_reference"] = english_reference
466
+
467
+ # If user requested English translation, include it
468
+ if translate_to_english:
469
+ response_data["english_text"] = english_reference
470
+ else:
471
+ # If the original is English, use it as reference
472
+ english_reference = text
473
+ response_data["english_reference"] = english_reference
474
 
475
+ await update_processing_status(file_hash, progress=90)
476
+
477
+ # Save to cache with English reference
478
  await save_to_cache(
479
  file_hash, filename, file_size,
480
+ json.dumps(response_data), english_reference, detected_language
481
  )
482
 
483
  await update_processing_status(file_hash, status='completed', progress=100)
 
495
  except Exception as e:
496
  logger.error(f"Error deleting temp file: {e}")
497
 
498
+ def translate_text_chunk(text: str, target_code: str) -> str:
499
+ """Translate a single chunk of text"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  try:
501
+ # Tokenize input
502
+ inputs = translation_tokenizer(
503
+ text,
504
+ return_tensors="pt",
505
+ padding=True,
506
+ truncation=True,
507
+ max_length=512
508
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
 
510
+ if device == "cuda":
511
+ inputs = {k: v.to(device) for k, v in inputs.items()}
512
 
513
+ # Generate translation
514
+ translated_tokens = translation_model.generate(
515
+ **inputs,
516
+ forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
517
+ max_length=512,
518
+ num_beams=5,
519
+ early_stopping=True
520
+ )
521
 
522
+ # Decode output
523
+ translated_text = translation_tokenizer.batch_decode(
524
+ translated_tokens,
525
+ skip_special_tokens=True
526
+ )[0].strip()
527
 
528
+ return translated_text
 
 
 
 
 
 
 
529
 
 
 
530
  except Exception as e:
531
+ logger.error(f"Error translating chunk: {e}")
532
+ return f"[Translation error: {str(e)}]"
533
 
534
  @app.get("/")
535
  async def root():
 
546
 
547
  cursor.execute('SELECT COUNT(*) FROM translation_cache')
548
  translation_cache_count = cursor.fetchone()[0] or 0
549
+
550
+ # Count cached English references
551
+ cursor.execute('SELECT COUNT(*) FROM cache WHERE english_reference IS NOT NULL')
552
+ english_reference_count = cursor.fetchone()[0] or 0
553
 
554
  return {
555
  "message": "Whisper API with Translation is running",
556
  "device": device,
557
  "cuda_available": torch.cuda.is_available(),
558
  "cached_files": cache_count,
559
+ "english_references": english_reference_count,
560
  "translation_cache": translation_cache_count,
561
  "currently_processing": processing_count,
562
  "translation_available": translation_model is not None,
 
573
  translate_to_english: bool = False,
574
  language: Optional[str] = Query(None, description="Specify language code for better accuracy")
575
  ):
576
+ """Transcribe audio file to text with English reference caching"""
577
  tmp_file_path = None
578
 
579
  try:
 
605
  if cached_result:
606
  logger.info("Cache hit - returning cached result")
607
  await remove_processing_status(file_hash)
608
+ return JSONResponse(cached_result)
609
 
610
  # Check if already processing
611
  processing_status = await get_processing_status(file_hash)
 
655
  "from_cache": False
656
  }
657
 
658
+ # Always get English reference
659
+ english_reference = None
660
+ if detected_language != "en":
661
+ # Get English translation using Whisper's translate task
662
  transcribe_args['task'] = "translate"
663
  english_result = whisper_model.transcribe(tmp_file_path, **transcribe_args)
664
+ english_reference = english_result["text"].strip()
665
+ if english_reference:
666
+ response_data["english_reference"] = english_reference
667
+
668
+ # If user requested English translation, include it
669
+ if translate_to_english:
670
+ response_data["english_text"] = english_reference
671
+ else:
672
+ # If original is English, use it as reference
673
+ english_reference = text
674
+ response_data["english_reference"] = english_reference
675
 
676
+ # Save to cache with English reference
677
  await save_to_cache(
678
  file_hash, file.filename, file_size,
679
+ json.dumps(response_data), english_reference, detected_language
680
  )
681
 
682
  return JSONResponse(response_data)
 
718
  @app.post("/translate")
719
  async def translate_endpoint(
720
  text: str = Form(..., min_length=1),
721
+ target_language: str = Form(...),
722
+ file_hash: Optional[str] = Form(None, description="Hash of audio file for using English reference")
723
  ):
724
+ """Translate text to target language - preferably using English reference from audio"""
725
 
726
  if not translation_model:
727
  raise HTTPException(status_code=503, detail="Translation service not available")
 
737
  detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
738
  )
739
 
740
+ # If target language is English, check if we have English reference from audio
741
+ if target_language_lower == 'english' and file_hash:
742
+ english_reference = await get_english_reference_from_cache(file_hash)
743
+ if english_reference:
744
+ return JSONResponse({
745
+ "text": text,
746
+ "translated_text": english_reference,
747
+ "target_language": target_language,
748
+ "from_cache": True,
749
+ "source": "whisper_english_reference"
750
+ })
751
+
752
+ # Determine source text for translation
753
+ source_text = text
754
+ source_language = "unknown"
755
+
756
+ # If we have a file_hash, try to use English reference for better translation
757
+ if file_hash:
758
+ english_reference = await get_english_reference_from_cache(file_hash)
759
+ if english_reference and target_language_lower != 'english':
760
+ source_text = english_reference
761
+ source_language = "english"
762
+ logger.info(f"Using English reference for translation to {target_language}")
763
+
764
+ # Check translation cache
765
+ text_hash = calculate_text_hash(source_text)
766
+ cached_translation = await get_translation_from_cache(text_hash, target_language_lower, source_language)
767
  if cached_translation:
768
  return JSONResponse({
769
  "text": text,
770
  "translated_text": cached_translation,
771
  "target_language": target_language,
772
+ "from_cache": True,
773
+ "source_language": source_language
774
  })
775
 
776
+ # Perform translation
777
  try:
778
  target_code = LANGUAGE_CODES[target_language_lower]
779
 
780
+ # Split text into sentences for better translation
781
+ sentences = re.split(r'(?<=[.!?])\s+', source_text)
782
+ logger.info(f"Split text into {len(sentences)} sentences for translation")
783
 
784
+ translated_sentences = []
785
 
786
+ for i, sentence in enumerate(sentences):
787
+ if not sentence.strip():
788
  continue
789
 
790
+ logger.info(f"Translating sentence {i+1}/{len(sentences)}: '{sentence[:50]}...'")
 
 
791
 
792
+ try:
793
+ # Tokenize input
794
+ inputs = translation_tokenizer(
795
+ sentence,
796
+ return_tensors="pt",
797
+ padding=True,
798
+ truncation=True,
799
+ max_length=512
800
+ )
801
+
802
+ if device == "cuda":
803
+ inputs = {k: v.to(device) for k, v in inputs.items()}
804
+
805
+ # Generate translation
806
+ translated_tokens = translation_model.generate(
807
+ **inputs,
808
+ forced_bos_token_id=translation_tokenizer.lang_code_to_id[target_code],
809
+ max_length=512,
810
+ num_beams=4,
811
+ early_stopping=True
812
+ )
813
+
814
+ # Decode output
815
+ translated_sentence = translation_tokenizer.batch_decode(
816
+ translated_tokens,
817
+ skip_special_tokens=True
818
+ )[0].strip()
819
+
820
+ if translated_sentence:
821
+ translated_sentences.append(translated_sentence)
822
+ else:
823
+ translated_sentences.append(f"[Translation failed for: {sentence}]")
824
+
825
+ except Exception as e:
826
+ logger.error(f"Error translating sentence {i+1}: {e}")
827
+ translated_sentences.append(f"[Translation error: {str(e)}]")
828
 
829
+ # Combine translated sentences
830
+ translated_text = ' '.join(translated_sentences)
831
 
832
+ if not translated_text or translated_text.startswith("[Translation error"):
833
+ raise HTTPException(status_code=500, detail="Translation returned empty result or encountered errors")
834
 
835
+ # Clean up translation
836
  translated_text = re.sub(r'\s+', ' ', translated_text).strip()
 
 
 
 
837
 
838
  # Save to cache
839
+ await save_translation_to_cache(text_hash, target_language_lower, translated_text, source_language)
840
 
841
  return JSONResponse({
842
  "text": text,
843
  "translated_text": translated_text,
844
  "target_language": target_language,
845
  "from_cache": False,
846
+ "source_language": source_language,
847
+ "used_english_reference": bool(file_hash and source_language == "english")
848
  })
849
 
 
 
850
  except Exception as e:
851
  logger.error(f"Translation error: {e}")
852
  raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
853
 
854
@app.post("/translate_from_audio")
async def translate_from_audio_endpoint(
    file_hash: str = Form(...),
    target_language: str = Form(...)
):
    """Translate a cached audio file's English reference into another language.

    The English reference stored for ``file_hash`` is split into sentences,
    each sentence is translated separately, and the pieces are joined back
    together with single spaces.

    Args:
        file_hash: Hash identifying the cached audio entry.
        target_language: Language name; matched case-insensitively against
            the keys of ``LANGUAGE_CODES``.

    Returns:
        A JSON response containing ``file_hash``, ``translated_text``,
        ``target_language`` and ``from_cache``, plus either ``source``
        (English requested) or ``source_language`` (translated result).

    Raises:
        HTTPException: 503 if no translation model is loaded, 400 for an
            unsupported language, 404 if no English reference is cached for
            ``file_hash``, 500 if translation fails.
    """
    if not translation_model:
        raise HTTPException(status_code=503, detail="Translation service not available")

    target_language_lower = target_language.lower()
    if target_language_lower not in LANGUAGE_CODES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language. Supported: {list(LANGUAGE_CODES.keys())}"
        )

    # Get English reference from cache
    english_reference = await get_english_reference_from_cache(file_hash)
    if not english_reference:
        raise HTTPException(status_code=404, detail="Audio file not found in cache or no English reference available")

    # If target is English, return the English reference directly
    if target_language_lower == 'english':
        return JSONResponse({
            "file_hash": file_hash,
            "translated_text": english_reference,
            "target_language": target_language,
            "from_cache": True,
            "source": "whisper_english_reference"
        })

    # Check translation cache
    text_hash = calculate_text_hash(english_reference)
    cached_translation = await get_translation_from_cache(text_hash, target_language_lower, "english")
    if cached_translation:
        return JSONResponse({
            "file_hash": file_hash,
            "translated_text": cached_translation,
            "target_language": target_language,
            "from_cache": True,
            "source_language": "english"
        })

    # Perform translation from English reference
    try:
        target_code = LANGUAGE_CODES[target_language_lower]
        # Loop-invariant: resolve the target-language token id once.
        forced_bos_token_id = translation_tokenizer.lang_code_to_id[target_code]

        # Split text into sentences for better translation
        sentences = re.split(r'(?<=[.!?])\s+', english_reference)
        logger.info(f"Translating from English reference - {len(sentences)} sentences to {target_language}")

        translated_sentences = []
        # Set when any sentence yields a placeholder instead of a translation,
        # so a partially failed result is never written to the cache.
        had_failure = False

        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                continue

            try:
                # Tokenize input
                inputs = translation_tokenizer(
                    sentence,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512
                )

                if device == "cuda":
                    inputs = {k: v.to(device) for k, v in inputs.items()}

                # Generate translation
                translated_tokens = translation_model.generate(
                    **inputs,
                    forced_bos_token_id=forced_bos_token_id,
                    max_length=512,
                    num_beams=4,
                    early_stopping=True
                )

                # Decode output
                translated_sentence = translation_tokenizer.batch_decode(
                    translated_tokens,
                    skip_special_tokens=True
                )[0].strip()

                if translated_sentence:
                    translated_sentences.append(translated_sentence)
                else:
                    had_failure = True
                    translated_sentences.append(f"[Translation failed for: {sentence}]")

            except Exception as e:
                # Best effort: keep going so one bad sentence does not
                # discard the sentences that did translate.
                had_failure = True
                logger.error(f"Error translating sentence {i+1}: {e}")
                translated_sentences.append(f"[Translation error: {str(e)}]")

        # Combine translated sentences
        translated_text = ' '.join(translated_sentences)

        if not translated_text or translated_text.startswith("[Translation error"):
            raise HTTPException(status_code=500, detail="Translation returned empty result or encountered errors")

        # Clean up translation
        translated_text = re.sub(r'\s+', ' ', translated_text).strip()

        if had_failure:
            # Caching a result that embeds error placeholders would serve the
            # broken text on every later request for this reference.
            logger.warning(f"Translation for {file_hash} contains failed sentences; result not cached")
        else:
            # Save to cache
            await save_translation_to_cache(text_hash, target_language_lower, translated_text, "english")

        return JSONResponse({
            "file_hash": file_hash,
            "translated_text": translated_text,
            "target_language": target_language,
            "from_cache": False,
            "source_language": "english",
            "used_english_reference": True
        })

    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged
        # rather than being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"Translation from audio error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}") from e
973
+
974
@app.get("/audio_info/{file_hash}")
async def get_audio_info(file_hash: str):
    """Return metadata stored in the ``cache`` table for one audio file.

    The response reports the filename, size, detected language and
    timestamps, whether an English reference is stored, and a preview of
    that reference cut to 100 characters followed by ``...`` when longer.

    Raises:
        HTTPException: 404 if no row matches ``file_hash``; 500 if the
            lookup fails for any other reason.
    """
    try:
        with db_manager.get_connection() as conn:
            cur = conn.cursor()
            cur.execute(
                '''SELECT filename, file_size, language, english_reference,
                created_at, last_accessed FROM cache WHERE file_hash = ?''',
                (file_hash,)
            )
            row = cur.fetchone()

        if not row:
            raise HTTPException(status_code=404, detail="Audio file not found in cache")

        filename, size_bytes, language, reference, created, accessed = row

        # Short references (or a missing one) are passed through untouched.
        preview = reference
        if reference and len(reference) > 100:
            preview = reference[:100] + "..."

        payload = {
            "file_hash": file_hash,
            "filename": filename,
            "file_size": size_bytes,
            "detected_language": language,
            "has_english_reference": bool(reference),
            "english_reference_preview": preview,
            "created_at": created,
            "last_accessed": accessed
        }
        return JSONResponse(payload)
    except HTTPException:
        # Let the 404 above through unchanged.
        raise
    except Exception as e:
        logger.error(f"Error getting audio info: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve audio information")
1005
+
1006
  @app.get("/languages")
1007
  async def get_supported_languages():
1008
  """Get list of supported languages for translation"""
 
1019
  cached_result = await get_from_cache(file_hash)
1020
  if cached_result:
1021
  await remove_processing_status(file_hash)
1022
+ cached_result.update({
 
1023
  "status": "completed",
1024
  "from_cache": True,
1025
  "message": "Processing completed and result is ready"
1026
  })
1027
+ return JSONResponse(cached_result)
1028
 
1029
  # Check processing status
1030
  processing_status = await get_processing_status(file_hash)
 
1056
  "translation_loaded": translation_model is not None
1057
  }
1058
 
1059
@app.delete("/cache/{file_hash}")
async def delete_from_cache(file_hash: str):
    """Delete one file's rows from ``cache`` and ``processing_status``.

    Args:
        file_hash: Hash identifying the entry to remove.

    Returns:
        A JSON confirmation message when at least one row was removed.

    Raises:
        HTTPException: 404 if neither table held a row for ``file_hash``;
            500 if the deletion itself fails.
    """
    try:
        with db_manager.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute('DELETE FROM cache WHERE file_hash = ?', (file_hash,))
            # rowcount only describes the most recent statement, so it has to
            # be read before the next DELETE overwrites it.
            cache_rows_deleted = cursor.rowcount

            cursor.execute('DELETE FROM processing_status WHERE file_hash = ?', (file_hash,))
            status_rows_deleted = cursor.rowcount

            conn.commit()

        if cache_rows_deleted > 0 or status_rows_deleted > 0:
            return JSONResponse({"message": f"File {file_hash} deleted from cache"})
        raise HTTPException(status_code=404, detail="File not found in cache")
    except HTTPException:
        # Let the 404 above through unchanged.
        raise
    except Exception as e:
        logger.error(f"Error deleting from cache: {e}")
        raise HTTPException(status_code=500, detail="Failed to delete from cache") from e
1078
+
1079
@app.get("/cache/stats")
async def get_cache_stats():
    """Report aggregate figures for the ``cache`` and ``translation_cache`` tables.

    The response holds the number of cached files and their average size,
    how many carry an English reference, the number of cached translations,
    and per-language counts for both tables.

    Raises:
        HTTPException: 500 if any of the queries fails.
    """
    try:
        with db_manager.get_connection() as conn:
            cur = conn.cursor()

            # Totals for the audio cache
            cur.execute('SELECT COUNT(*), AVG(file_size) FROM cache')
            file_total, mean_size = cur.fetchone()

            cur.execute('SELECT COUNT(*) FROM cache WHERE english_reference IS NOT NULL')
            with_reference = cur.fetchone()[0]

            cur.execute('SELECT COUNT(*) FROM translation_cache')
            translation_total = cur.fetchone()[0]

            # Per-language breakdowns, most frequent first
            cur.execute('''SELECT language, COUNT(*) FROM cache
                WHERE language IS NOT NULL
                GROUP BY language ORDER BY COUNT(*) DESC''')
            by_source_language = cur.fetchall()

            cur.execute('''SELECT target_language, COUNT(*) FROM translation_cache
                GROUP BY target_language ORDER BY COUNT(*) DESC''')
            by_target_language = cur.fetchall()

        summary = {
            "total_cached_files": file_total or 0,
            # AVG() yields NULL on an empty table; report 0 in that case.
            "average_file_size_bytes": int(mean_size or 0),
            "files_with_english_reference": with_reference,
            "total_translations": translation_total,
            "language_distribution": {lang: count for lang, count in by_source_language},
            "translation_language_distribution": {lang: count for lang, count in by_target_language}
        }
        return JSONResponse(summary)
    except Exception as e:
        logger.error(f"Error getting cache stats: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve cache statistics")
1116
+
1117
  if __name__ == "__main__":
1118
  uvicorn.run(
1119
  app,