tejovanth committed on
Commit
b26f983
·
verified ·
1 Parent(s): a725b8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -3
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import fitz
3
  import torch
4
  from transformers import pipeline
5
- import time, logging
6
 
7
  logging.basicConfig(level=logging.ERROR)
8
  device = -1 # CPU-only
@@ -18,7 +18,14 @@ def summarize_file(file_bytes):
18
  start = time.time()
19
  print(f"File type: {type(file_bytes)}")
20
  try:
21
- text = "".join(page.get_text("text", flags=16) for page in fitz.open(stream=file_bytes, filetype="pdf")) if file_bytes[:4].startswith(b'%PDF') else file_bytes.decode("utf-8", errors="ignore")
 
 
 
 
 
 
 
22
  except Exception as e:
23
  return f"❌ Text extraction failed: {str(e)}"
24
  if not text.strip(): return "❌ No text found"
@@ -28,9 +35,12 @@ def summarize_file(file_bytes):
28
  if not chunks: return "❌ No chunks to summarize"
29
  summaries = []
30
  for i, chunk in enumerate(chunks):
31
- if time.time() - start > 15:
32
  summaries.append("⚠️ Stopped early")
33
  break
 
 
 
34
  try:
35
  summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
36
  summaries.append(f"**Chunk {i+1}**:\n{summary}")
 
2
  import fitz
3
  import torch
4
  from transformers import pipeline
5
+ import time, logging, re
6
 
7
  logging.basicConfig(level=logging.ERROR)
8
  device = -1 # CPU-only
 
18
  start = time.time()
19
  print(f"File type: {type(file_bytes)}")
20
  try:
21
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
22
+ text = "".join(page.get_text("text") for page in doc)
23
+ # Clean OCR noise: replace LaTeX, remove excessive whitespace, non-ASCII
24
+ text = re.sub(r"\$\s*([^$]+)\s*\$", r"\1", text) # Strip $...$
25
+ text = re.sub(r"\\cap", "intersection", text) # Handle ∩
26
+ text = re.sub(r"\s+", " ", text).strip() # Normalize whitespace
27
+ text = "".join(c for c in text if ord(c) < 128) # ASCII only
28
+ print(f"Extracted chars: {len(text)}")
29
  except Exception as e:
30
  return f"❌ Text extraction failed: {str(e)}"
31
  if not text.strip(): return "❌ No text found"
 
35
  if not chunks: return "❌ No chunks to summarize"
36
  summaries = []
37
  for i, chunk in enumerate(chunks):
38
+ if time.time() - start > 20:
39
  summaries.append("⚠️ Stopped early")
40
  break
41
+ if sum(1 for c in chunk if not c.isalnum()) / len(chunk) > 0.5: # Skip equation-heavy chunks
42
+ summaries.append(f"**Chunk {i+1}**: Skipped (equation-heavy)")
43
+ continue
44
  try:
45
  summary = summarizer(chunk, max_length=60, min_length=10, do_sample=False)[0]['summary_text']
46
  summaries.append(f"**Chunk {i+1}**:\n{summary}")