aradhyapavan committed on
Commit
09d964c
·
verified ·
1 Parent(s): 0dc9527

Update app.py: simplify get_total_pages and enable debug mode in app.run

Browse files
Files changed (1) hide show
  1. app.py +10 -38
app.py CHANGED
@@ -142,48 +142,20 @@ def compute_pdf_hash(pdf_path):
142
  return hashlib.md5(f.read()).hexdigest()
143
 
144
  def get_total_pages(pdf_path):
145
- """Get total pages robustly and extract text per page."""
146
  reader = PdfReader(pdf_path)
147
- # Handle PDFs that are flagged as encrypted but openable without a password
148
- try:
149
- if getattr(reader, "is_encrypted", False):
150
- try:
151
- reader.decrypt("")
152
- except Exception:
153
- pass
154
- except Exception:
155
- pass
156
-
157
- try:
158
- num_pages = len(reader.pages)
159
- except Exception:
160
- # Fallback: iterate until failure (very rare)
161
- num_pages = 0
162
- try:
163
- while True:
164
- _ = reader.pages[num_pages]
165
- num_pages += 1
166
- except Exception:
167
- pass
168
-
169
  documents = []
170
- for idx in range(num_pages):
171
- page_num = idx + 1
172
- try:
173
- page = reader.pages[idx]
174
- text = page.extract_text() or ""
175
- except Exception:
176
- text = ""
177
-
178
- class Doc:
179
  def __init__(self, content, page_number):
180
  self.content = content
181
  self.page_number = page_number
182
-
183
- documents.append(Doc(text, page_num))
184
-
185
- print(f"PDF detected pages: {num_pages} for {os.path.basename(pdf_path)}")
186
- return num_pages, documents
187
 
188
  def get_embeddings(text):
189
  """Generate embeddings for text."""
@@ -928,4 +900,4 @@ if __name__ == '__main__':
928
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
929
  os.makedirs(FAISS_INDEX_DIR, exist_ok=True)
930
  port = int(os.getenv("PORT", "7860"))
931
- app.run(host="0.0.0.0", port=port)
 
142
  return hashlib.md5(f.read()).hexdigest()
143
 
144
  def get_total_pages(pdf_path):
145
+ """Get total pages and extract text using PyPDF2 for fast processing."""
146
  reader = PdfReader(pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  documents = []
148
+ for page_num, page in enumerate(reader.pages, 1):
149
+ text = page.extract_text() or ""
150
+
151
+ class Doc:
 
 
 
 
 
152
  def __init__(self, content, page_number):
153
  self.content = content
154
  self.page_number = page_number
155
+
156
+ doc = Doc(text, page_num)
157
+ documents.append(doc)
158
+ return len(documents), documents
 
159
 
160
  def get_embeddings(text):
161
  """Generate embeddings for text."""
 
900
  os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
901
  os.makedirs(FAISS_INDEX_DIR, exist_ok=True)
902
  port = int(os.getenv("PORT", "7860"))
903
+ app.run(host="0.0.0.0", port=port, debug=True)