yousbek committed on
Commit
210c948
·
verified ·
1 Parent(s): 977767b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +74 -77
main.py CHANGED
@@ -12,7 +12,7 @@ from transformers import (
12
  )
13
  from utils import extract_text, save_file, verify_summary, ensure_complete_sentences
14
  from utils import convert_to_text, save_translated_file
15
- from langdetect import detect, DetectorFactory, LangDetectException
16
  from langcodes import Language
17
  import torch
18
  from huggingface_hub import InferenceClient
@@ -103,14 +103,15 @@ def split_text(text, max_tokens=900):
103
  chunks.append(summary_tokenizer.convert_tokens_to_string(current_chunk))
104
 
105
  return chunks
106
-
 
107
  # Document & Image Analysis (Summarization & Interpretation)
108
  @app.post("/docsum_imginter")
109
  async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
110
  file_path = UPLOAD_DIR / file.filename
111
  output_filename = f"summarized_{file.filename}"
112
  output_path = PROCESSED_DIR / output_filename
113
-
114
  with open(file_path, "wb") as f:
115
  shutil.copyfileobj(file.file, f)
116
 
@@ -122,21 +123,21 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
122
 
123
  if len(text.strip().split()) < 150:
124
  raise HTTPException(
125
- 400,
126
- "WallD thinks the file is too small for summarization - minimum 150 words",
127
- )
128
 
129
  text = text.encode("ascii", "ignore").decode("ascii")
130
 
131
  chunks = split_text(text)
132
  summaries = []
133
  prompt = (
134
- "Generate a concise, factual summary covering ALL key sections of the text. "
135
- "Include: main objectives, critical details, and outcomes if mentioned. "
136
- "Never include: contact information, website links, or promotional content. "
137
- "\n"
138
- "Text to summarize:\n{chunk}"
139
- )
140
 
141
  for chunk in chunks:
142
  word_count = len(chunk.split())
@@ -145,21 +146,21 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
145
  continue
146
 
147
  max_length = min(
148
- max(int(word_count * 0.4), 150),
149
- 512,
150
- )
151
  summary_result = summarizer(
152
- prompt.format(chunk=chunk),
153
- max_length=max_length,
154
- min_length=max(150, int(max_length * 0.6)),
155
- do_sample=False,
156
- truncation=True,
157
- repetition_penalty=1.5,
158
- no_repeat_ngram_size=3,
159
- early_stopping=False,
160
- num_beams=4,
161
- length_penalty=1.0,
162
- )
163
 
164
  if summary_result:
165
  raw_summary = summary_result[0]["summary_text"]
@@ -170,46 +171,46 @@ async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
170
 
171
  if not summaries:
172
  raise HTTPException(
173
- 500, "Summary verification failed - no valid content extracted"
174
- )
175
 
176
  full_summary = "\n".join(filter(None, summaries))
177
  if len(summaries) > 1:
178
  full_summary = summarizer(
179
- f"Combine these partial summaries into one coherent paragraph:\n{full_summary}",
180
- max_length=512,
181
- )[0]["summary_text"]
182
 
183
  if not full_summary.strip():
184
  sentences = [s.strip() for s in text.split(".") if s.strip()]
185
  full_summary = (
186
- ". ".join(sentences[:3]) + "." if sentences else text[:500]
187
- )
188
 
189
  save_file(full_summary, file_path, file_type, output_path)
190
  return FileResponse(output_path, filename=output_filename)
191
 
192
  elif task.lower() == "interpret":
193
- try:
194
  with Image.open(file_path) as image:
195
  if image.mode != "RGB":
196
  image = image.convert("RGB")
197
-
198
  inputs = processor(images=image, return_tensors="pt")
199
  if inputs is None or "pixel_values" not in inputs:
200
  raise ValueError("Image processing failed: No valid inputs generated.")
201
-
202
  outputs = interpretation_model.generate(**inputs, repetition_penalty=1.2)
203
  if outputs is None:
204
  raise ValueError("Model generation failed: No outputs produced.")
205
-
206
  caption = processor.decode(outputs[0], skip_special_tokens=True)
207
 
208
  return {"caption": caption if caption else "No caption generated"}
209
 
210
  except Exception as e:
211
  raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
212
-
213
  finally:
214
  if file_path.exists():
215
  file_path.unlink()
@@ -221,7 +222,6 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
221
  try:
222
  file_type = file.filename.split(".")[-1].lower()
223
  file_path = UPLOAD_DIR / file.filename
224
-
225
 
226
  with open(file_path, "wb") as f:
227
  shutil.copyfileobj(file.file, f)
@@ -244,7 +244,6 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
244
  detail="The File doesn't contain any text.",
245
  )
246
 
247
-
248
  result = question_answering(question=question, context=text)
249
  return {"answer": result["answer"]}
250
 
@@ -252,11 +251,10 @@ async def ask(file: UploadFile = File(...), question: str = Form(...)):
252
  raise HTTPException(
253
  status_code=500, detail=f"Error processing request: {str(e)}"
254
  )
255
-
256
  finally:
257
  if file_path.exists():
258
  file_path.unlink()
259
-
260
 
261
 
262
  # Data Visualization Code Generation
@@ -318,10 +316,10 @@ async def visualization(file: UploadFile = File(...), request: str = Form(...)):
318
  exec(executable_code, exec_globals)
319
  plt.savefig(plot_path, bbox_inches="tight")
320
  plt.close()
321
-
322
  except Exception as e:
323
  raise HTTPException(status_code=500, detail=f"Error executing code: {str(e)}")
324
-
325
  finally:
326
  if file_path.exists():
327
  file_path.unlink()
@@ -335,43 +333,38 @@ async def visualization(file: UploadFile = File(...), request: str = Form(...)):
335
 
336
  # Text Translation
337
 
338
- LANGUAGE_CODE_MAPPING = {
339
- 'french': 'fr','fr': 'fr',
340
- 'english': 'en','en': 'en',
341
- 'spanish': 'es','es': 'es',
342
- 'german': 'de','de': 'de',
343
- 'italian': 'it', 'it': 'it',
344
- 'portuguese': 'pt','pt': 'pt',
345
- 'dutch': 'nl','nl': 'nl',
346
- 'russian': 'ru','ru': 'ru',
347
- 'chinese': 'zh','zh': 'zh',
348
- 'japanese': 'ja','ja': 'ja',
349
- 'arabic': 'ar','ar': 'ar',
350
- }
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
 
353
  async def translate_large_text(text, target_lang="fr"):
354
- # Détection automatique de la langue source
355
- try:
356
- src_lang = detect(text)
357
- # Si la détection échoue ou retourne un code incorrect, on utilise l'anglais par défaut
358
- if not src_lang or len(src_lang) != 2:
359
- src_lang = "en"
360
- except:
361
- src_lang = "en"
362
-
363
- # Normalisation du code de langue cible
364
- target_lang = target_lang.lower().strip()
365
- target_lang = LANGUAGE_CODE_MAPPING.get(target_lang, target_lang[:2])
366
-
367
  chunks = split_tran_text_trans(text)
368
  translated_chunks = []
369
 
370
- try:
371
- translation_tokenizer.src_lang = src_lang
372
- except:
373
- # Si la langue source n'est pas supportée, on utilise l'anglais par défaut
374
- translation_tokenizer.src_lang = "en"
375
 
376
  for chunk in chunks:
377
  try:
@@ -385,14 +378,15 @@ async def translate_large_text(text, target_lang="fr"):
385
  translated_chunks.append(translated)
386
  except Exception as e:
387
  print(f"Error translating chunk: {str(e)}")
388
- translated_chunks.append(chunk) # keep the original in case of error
389
 
390
  return "\n\n".join(translated_chunks)
391
 
 
392
  @app.post("/translate")
393
  async def translate_document(
394
  file: UploadFile = File(...),
395
- target_language: str = "fr", # peut être "french", "français", "fr", etc.
396
  ):
397
  try:
398
  text = await convert_to_text(file)
@@ -414,3 +408,6 @@ async def translate_document(
414
 
415
  except Exception as e:
416
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
12
  )
13
  from utils import extract_text, save_file, verify_summary, ensure_complete_sentences
14
  from utils import convert_to_text, save_translated_file
15
+ from langdetect import detect, DetectorFactory
16
  from langcodes import Language
17
  import torch
18
  from huggingface_hub import InferenceClient
 
103
  chunks.append(summary_tokenizer.convert_tokens_to_string(current_chunk))
104
 
105
  return chunks
106
+
107
+
108
  # Document & Image Analysis (Summarization & Interpretation)
109
  @app.post("/docsum_imginter")
110
  async def docsum_imginter(file: UploadFile = File(...), task: str = Form(...)):
111
  file_path = UPLOAD_DIR / file.filename
112
  output_filename = f"summarized_{file.filename}"
113
  output_path = PROCESSED_DIR / output_filename
114
+
115
  with open(file_path, "wb") as f:
116
  shutil.copyfileobj(file.file, f)
117
 
 
123
 
124
  if len(text.strip().split()) < 150:
125
  raise HTTPException(
126
+ 400,
127
+ "WallD thinks the file is too small for summarization - minimum 150 words",
128
+ )
129
 
130
  text = text.encode("ascii", "ignore").decode("ascii")
131
 
132
  chunks = split_text(text)
133
  summaries = []
134
  prompt = (
135
+ "Generate a concise, factual summary covering ALL key sections of the text. "
136
+ "Include: main objectives, critical details, and outcomes if mentioned. "
137
+ "Never include: contact information, website links, or promotional content. "
138
+ "\n"
139
+ "Text to summarize:\n{chunk}"
140
+ )
141
 
142
  for chunk in chunks:
143
  word_count = len(chunk.split())
 
146
  continue
147
 
148
  max_length = min(
149
+ max(int(word_count * 0.4), 150),
150
+ 512,
151
+ )
152
  summary_result = summarizer(
153
+ prompt.format(chunk=chunk),
154
+ max_length=max_length,
155
+ min_length=max(150, int(max_length * 0.6)),
156
+ do_sample=False,
157
+ truncation=True,
158
+ repetition_penalty=1.5,
159
+ no_repeat_ngram_size=3,
160
+ early_stopping=False,
161
+ num_beams=4,
162
+ length_penalty=1.0,
163
+ )
164
 
165
  if summary_result:
166
  raw_summary = summary_result[0]["summary_text"]
 
171
 
172
  if not summaries:
173
  raise HTTPException(
174
+ 500, "Summary verification failed - no valid content extracted"
175
+ )
176
 
177
  full_summary = "\n".join(filter(None, summaries))
178
  if len(summaries) > 1:
179
  full_summary = summarizer(
180
+ f"Combine these partial summaries into one coherent paragraph:\n{full_summary}",
181
+ max_length=512,
182
+ )[0]["summary_text"]
183
 
184
  if not full_summary.strip():
185
  sentences = [s.strip() for s in text.split(".") if s.strip()]
186
  full_summary = (
187
+ ". ".join(sentences[:3]) + "." if sentences else text[:500]
188
+ )
189
 
190
  save_file(full_summary, file_path, file_type, output_path)
191
  return FileResponse(output_path, filename=output_filename)
192
 
193
  elif task.lower() == "interpret":
194
+ try:
195
  with Image.open(file_path) as image:
196
  if image.mode != "RGB":
197
  image = image.convert("RGB")
198
+
199
  inputs = processor(images=image, return_tensors="pt")
200
  if inputs is None or "pixel_values" not in inputs:
201
  raise ValueError("Image processing failed: No valid inputs generated.")
202
+
203
  outputs = interpretation_model.generate(**inputs, repetition_penalty=1.2)
204
  if outputs is None:
205
  raise ValueError("Model generation failed: No outputs produced.")
206
+
207
  caption = processor.decode(outputs[0], skip_special_tokens=True)
208
 
209
  return {"caption": caption if caption else "No caption generated"}
210
 
211
  except Exception as e:
212
  raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
213
+
214
  finally:
215
  if file_path.exists():
216
  file_path.unlink()
 
222
  try:
223
  file_type = file.filename.split(".")[-1].lower()
224
  file_path = UPLOAD_DIR / file.filename
 
225
 
226
  with open(file_path, "wb") as f:
227
  shutil.copyfileobj(file.file, f)
 
244
  detail="The File doesn't contain any text.",
245
  )
246
 
 
247
  result = question_answering(question=question, context=text)
248
  return {"answer": result["answer"]}
249
 
 
251
  raise HTTPException(
252
  status_code=500, detail=f"Error processing request: {str(e)}"
253
  )
254
+
255
  finally:
256
  if file_path.exists():
257
  file_path.unlink()
 
258
 
259
 
260
  # Data Visualization Code Generation
 
316
  exec(executable_code, exec_globals)
317
  plt.savefig(plot_path, bbox_inches="tight")
318
  plt.close()
319
+
320
  except Exception as e:
321
  raise HTTPException(status_code=500, detail=f"Error executing code: {str(e)}")
322
+
323
  finally:
324
  if file_path.exists():
325
  file_path.unlink()
 
333
 
334
  # Text Translation
335
 
336
def split_tran_text_trans(text, max_chunk_size=400):
    """Split *text* into translation-sized chunks of whole paragraphs.

    Paragraphs (separated by one or more blank lines) are packed greedily:
    a chunk is flushed once adding the next paragraph would exceed
    ``max_chunk_size`` words. A single paragraph longer than the limit is
    still emitted as its own chunk, never split internally.

    Returns a list of chunk strings, paragraphs rejoined with blank lines.
    """
    pieces = []
    buffer = []
    words_in_buffer = 0

    for paragraph in re.split(r'\n\n+', text):
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        n_words = len(paragraph.split())
        # Flush the current chunk before this paragraph would overflow it.
        if buffer and words_in_buffer + n_words > max_chunk_size:
            pieces.append("\n\n".join(buffer))
            buffer = []
            words_in_buffer = 0

        buffer.append(paragraph)
        words_in_buffer += n_words

    if buffer:
        pieces.append("\n\n".join(buffer))

    return pieces
361
 
362
 
363
  async def translate_large_text(text, target_lang="fr"):
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  chunks = split_tran_text_trans(text)
365
  translated_chunks = []
366
 
367
+ translation_tokenizer.src_lang = "en"
 
 
 
 
368
 
369
  for chunk in chunks:
370
  try:
 
378
  translated_chunks.append(translated)
379
  except Exception as e:
380
  print(f"Error translating chunk: {str(e)}")
381
+ translated_chunks.append(chunk) # keep the original in case u got an error
382
 
383
  return "\n\n".join(translated_chunks)
384
 
385
+
386
  @app.post("/translate")
387
  async def translate_document(
388
  file: UploadFile = File(...),
389
+ target_language: str = "fr", # default target language
390
  ):
391
  try:
392
  text = await convert_to_text(file)
 
408
 
409
  except Exception as e:
410
  raise HTTPException(status_code=500, detail=str(e))
411
+
412
+ except Exception as e:
413
+ raise HTTPException(status_code=500, detail=str(e))