tejovanth committed on
Commit
8549f68
·
verified ·
1 Parent(s): 24d9906

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -8
app.py CHANGED
@@ -19,14 +19,21 @@ except Exception as e:
19
 
20
  def summarize_file(file):
21
  start = time.time()
22
- print(f"File: {file.name if hasattr(file, 'name') else 'unknown'}")
 
 
23
  try:
24
- file_bytes = file.read() if hasattr(file, 'read') else file
25
- mime, _ = mimetypes.guess_type(file.name) if hasattr(file, 'name') else (None, None)
 
 
26
  text = ""
27
  if mime == 'application/pdf':
28
- doc = fitz.open(stream=file_bytes, filetype="pdf")
29
- text = "".join(page.get_text("text") for page in doc)
 
 
 
30
  elif mime in ['text/plain', 'text/rtf']:
31
  text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
32
  elif mime in ['text/csv', 'application/vnd.ms-excel']:
@@ -42,14 +49,16 @@ def summarize_file(file):
42
  text = " ".join(df.astype(str).values.flatten())
43
  else:
44
  text = textract.process(file_bytes).decode("utf-8", errors="ignore")
 
 
45
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
46
  text = re.sub(r"\\cap", "intersection", text)
47
  text = re.sub(r"\s+", " ", text).strip()
48
- text = "".join(c for c in text if ord(c) < 128)
 
49
  print(f"Extracted chars: {len(text)}")
50
  except Exception as e:
51
  return f"❌ Text extraction failed: {str(e)}"
52
- if not text.strip(): return "❌ No text found"
53
  text = text[:300000]
54
  chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
55
  print(f"Chunks created: {len(chunks)}")
@@ -67,7 +76,7 @@ def summarize_file(file):
67
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
68
  except Exception as e:
69
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
70
- # Pad if <12 summaries
71
  while len(summaries) < 12:
72
  summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
73
  return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])
 
19
 
20
  def summarize_file(file):
21
  start = time.time()
22
+ if not hasattr(file, 'read') or not hasattr(file, 'name'):
23
+ return "❌ Invalid file: Missing read() or name attribute"
24
+ print(f"File: {file.name}")
25
  try:
26
+ file_bytes = file.read()
27
+ if not isinstance(file_bytes, bytes) or len(file_bytes) == 0:
28
+ return "❌ Invalid file: Empty or non-binary content"
29
+ mime, _ = mimetypes.guess_type(file.name) or ('text/plain', None)
30
  text = ""
31
  if mime == 'application/pdf':
32
+ try:
33
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
34
+ text = "".join(page.get_text("text") for page in doc)
35
+ except:
36
+ return "❌ PDF parsing failed"
37
  elif mime in ['text/plain', 'text/rtf']:
38
  text = rtf_to_text(file_bytes.decode("utf-8", errors="ignore")) if mime == 'text/rtf' else file_bytes.decode("utf-8", errors="ignore")
39
  elif mime in ['text/csv', 'application/vnd.ms-excel']:
 
49
  text = " ".join(df.astype(str).values.flatten())
50
  else:
51
  text = textract.process(file_bytes).decode("utf-8", errors="ignore")
52
+ # Strict text cleaning
53
+ text = re.sub(r"[^\x20-\x7E]", "", text) # Keep printable ASCII only
54
  text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text)
55
  text = re.sub(r"\\cap", "intersection", text)
56
  text = re.sub(r"\s+", " ", text).strip()
57
+ if not text or len(text) < 100 or sum(1 for c in text if c.isalnum()) < 50:
58
+ return "❌ Extracted text invalid or too short"
59
  print(f"Extracted chars: {len(text)}")
60
  except Exception as e:
61
  return f"❌ Text extraction failed: {str(e)}"
 
62
  text = text[:300000]
63
  chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
64
  print(f"Chunks created: {len(chunks)}")
 
76
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
77
  except Exception as e:
78
  summaries.append(f"**Chunk {i+1}**: ❌ Error: {str(e)}")
79
+ # Pad to 12 summaries
80
  while len(summaries) < 12:
81
  summaries.append(f"**Chunk {len(summaries)+1}**: Insufficient content for full summary")
82
  return f"**Chars**: {len(text)}\n**Time**: {time.time()-start:.2f}s\n\n" + "\n\n".join(summaries[:12])