Pranesh64 committed on
Commit
0721a21
Β·
verified Β·
1 Parent(s): 170ccaf

Create processing.py

Browse files
Files changed (1) hide show
  1. backend/processing.py +331 -0
backend/processing.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document processing module for loading and chunking documents.
3
+ Supports PDF, TXT, and MD files.
4
+ """
5
+
6
+ import re
7
+ import os
8
+ from typing import List, Dict
9
+ from io import BytesIO
10
+ import pypdf
11
+
12
+
13
def clean_text(text: str) -> str:
    """Normalize text: collapse every whitespace run to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
20
+
21
+
22
def estimate_tokens(text: str) -> int:
    """Rough token estimate: assume one token per ~4 characters (floor division)."""
    char_count = len(text)
    return char_count // 4
25
+
26
+
27
def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> List[str]:
    """
    Chunk text into overlapping windows based on an estimated token count.

    Tokens are approximated as 4 characters each. Chunk boundaries prefer a
    sentence ending ('.', '!', '?', or newline) when one occurs in the second
    half of the window, so chunks avoid splitting mid-sentence.

    Args:
        text: Input text to chunk
        chunk_size: Target token count per chunk
        overlap: Token overlap between consecutive chunks

    Returns:
        List of non-empty text chunks
    """
    if not text:
        return []

    # Convert token targets to characters (~4 chars per token).
    char_chunk_size = chunk_size * 4
    char_overlap = overlap * 4

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + char_chunk_size

        # If this is not the last chunk, try to break at a sentence boundary
        # anywhere in the window; only accept it if it falls in the second
        # half, so chunks do not collapse to tiny fragments.
        if end < text_length:
            sentence_end = max(
                text.rfind('.', start, end),
                text.rfind('!', start, end),
                text.rfind('?', start, end),
                text.rfind('\n', start, end)
            )
            if sentence_end > start + char_chunk_size // 2:
                end = sentence_end + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move the start position back by the overlap. Bug fix: the max()
        # guard guarantees forward progress; without it, an overlap larger
        # than the window advance made `start` move backwards and the loop
        # never terminated.
        start = max(end - char_overlap, start + 1)

    return chunks
77
+
78
+
79
def load_pdf_from_path(file_path: str) -> str:
    """
    Load and clean the text content of a PDF file on disk.

    Args:
        file_path: Path to PDF file

    Returns:
        Cleaned text of all pages, joined with blank lines

    Raises:
        ValueError: If the file cannot be opened or parsed as a PDF
    """
    try:
        with open(file_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            text_parts = []

            # Pages with no extractable text (e.g. scanned images) are skipped.
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)

            full_text = '\n\n'.join(text_parts)
            return clean_text(full_text)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"Error loading PDF {file_path}: {str(e)}") from e
103
+
104
+
105
def load_pdf(file_content: bytes, filename: str) -> str:
    """
    Load and clean the text content of an in-memory PDF.

    Args:
        file_content: PDF file bytes
        filename: Original filename (used in error messages)

    Returns:
        Cleaned text of all pages, joined with blank lines

    Raises:
        ValueError: If the bytes cannot be parsed as a PDF
    """
    try:
        pdf_file = BytesIO(file_content)
        reader = pypdf.PdfReader(pdf_file)
        text_parts = []

        # Pages with no extractable text (e.g. scanned images) are skipped.
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text)

        full_text = '\n\n'.join(text_parts)
        return clean_text(full_text)
    except Exception as e:
        # Bug fix: report the actual filename (the parameter was unused and
        # the message printed a placeholder), and chain the original exception.
        raise ValueError(f"Error loading PDF {filename}: {str(e)}") from e
130
+
131
+
132
def load_text_file_from_path(file_path: str) -> str:
    """
    Load and clean a TXT or MD file from disk.

    Tries UTF-8 first and falls back to latin-1 (which accepts any byte
    sequence) so legacy-encoded files still load.

    Args:
        file_path: Path to text file

    Returns:
        Cleaned text content

    Raises:
        ValueError: If the file cannot be read
    """
    try:
        # Try UTF-8 first, fallback to latin-1
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as file:
                text = file.read()

        return clean_text(text)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise ValueError(f"Error loading text file {file_path}: {str(e)}") from e
154
+
155
+
156
def load_text_file(file_content: bytes, filename: str) -> str:
    """
    Load and clean a TXT or MD file from raw bytes.

    Tries UTF-8 first and falls back to latin-1 (which accepts any byte
    sequence) so legacy-encoded files still load.

    Args:
        file_content: File bytes
        filename: Original filename (used in error messages)

    Returns:
        Cleaned text content

    Raises:
        ValueError: If the bytes cannot be decoded
    """
    try:
        # Try UTF-8 first, fallback to latin-1
        try:
            text = file_content.decode('utf-8')
        except UnicodeDecodeError:
            text = file_content.decode('latin-1')

        return clean_text(text)
    except Exception as e:
        # Bug fix: report the actual filename (the parameter was unused and
        # the message printed a placeholder), and chain the original exception.
        raise ValueError(f"Error loading text file {filename}: {str(e)}") from e
177
+
178
+
179
def process_documents(uploaded_files: List) -> List[Dict]:
    """
    Process uploaded files and return chunked documents.
    Works with Gradio file objects (which are file paths as strings).

    Args:
        uploaded_files: List of file paths (strings) from Gradio

    Returns:
        List of dictionaries with 'text', 'source', 'chunk_id', and
        'chunk_index' keys. Unsupported or failing files are skipped.
    """
    all_chunks = []

    print(f"📄 Processing {len(uploaded_files)} uploaded files...")

    for file_path in uploaded_files:
        try:
            # Extract filename from path. Consistency fix: use splitext like
            # process_documents_from_directory, so dotless names don't get
            # the whole filename treated as their extension.
            filename = os.path.basename(file_path)
            file_extension = os.path.splitext(filename)[1].lower()

            print(f"🔄 Processing: {filename}")

            # Load document based on type
            if file_extension == '.pdf':
                text = load_pdf_from_path(file_path)
            elif file_extension in ('.txt', '.md'):
                text = load_text_file_from_path(file_path)
            else:
                print(f"⚠️ Skipping unsupported file: {filename}")
                continue  # Skip unsupported files

            if not text:
                print(f"⚠️ No text extracted from: {filename}")
                continue

            # Chunk the text
            chunks = chunk_text(text, chunk_size=800, overlap=150)

            # Store chunks with metadata. Bug fix: chunk_id embeds the source
            # filename so IDs are unique across documents (the placeholder
            # string made every document's IDs collide).
            for idx, chunk in enumerate(chunks):
                all_chunks.append({
                    'text': chunk,
                    'source': filename,
                    'chunk_id': f"{filename}_chunk_{idx}",
                    'chunk_index': idx
                })

            print(f"✅ Created {len(chunks)} chunks from {filename}")

        except Exception as e:
            # Best-effort batch processing: log and move on to the next file.
            print(f"❌ Error processing {file_path}: {str(e)}")
            continue

    print(f"✅ Total chunks created: {len(all_chunks)}")
    return all_chunks
235
+
236
+
237
+ # Additional functions for local directory processing
238
def get_available_files(data_dir: str = "data") -> List[str]:
    """
    List supported document filenames (.pdf/.txt/.md) inside *data_dir*.

    Args:
        data_dir: Directory path containing documents

    Returns:
        Alphabetically sorted filenames; empty list if the directory is missing
    """
    if not os.path.exists(data_dir):
        return []

    allowed = {'.pdf', '.txt', '.md'}
    matches = [
        name
        for name in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, name))
        and os.path.splitext(name)[1].lower() in allowed
    ]
    return sorted(matches)
262
+
263
+
264
def process_documents_from_directory(data_dir: str = "data") -> List[Dict]:
    """
    Process all documents in the data directory and return chunked documents.

    Args:
        data_dir: Directory path containing documents

    Returns:
        List of dictionaries with 'text', 'source', 'chunk_id', and
        'chunk_index' keys

    Raises:
        FileNotFoundError: If *data_dir* does not exist
        ValueError: If the directory contains no supported files
    """
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Data directory '{data_dir}' not found")

    all_chunks = []
    supported_extensions = {'.pdf', '.txt', '.md'}

    # Collect (filename, path) pairs for every supported file in the directory.
    files = []
    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        if os.path.isfile(file_path):
            _, ext = os.path.splitext(filename)
            if ext.lower() in supported_extensions:
                files.append((filename, file_path))

    if not files:
        raise ValueError(f"No supported files found in '{data_dir}' directory")

    print(f"📄 Processing {len(files)} files from '{data_dir}' directory...")

    for filename, file_path in files:
        try:
            file_extension = os.path.splitext(filename)[1].lower()

            # Load document based on type
            if file_extension == '.pdf':
                text = load_pdf_from_path(file_path)
                print(f"✅ Loaded PDF: {filename}")
            elif file_extension in ('.txt', '.md'):
                text = load_text_file_from_path(file_path)
                print(f"✅ Loaded text file: {filename}")
            else:
                continue  # Unreachable given the filter above; kept defensively

            if not text:
                print(f"⚠️ No text extracted from: {filename}")
                continue

            # Chunk the text
            chunks = chunk_text(text, chunk_size=800, overlap=150)

            # Store chunks with metadata. Bug fix: chunk_id embeds the source
            # filename so IDs are unique across documents (the placeholder
            # string made every document's IDs collide).
            for idx, chunk in enumerate(chunks):
                all_chunks.append({
                    'text': chunk,
                    'source': filename,
                    'chunk_id': f"{filename}_chunk_{idx}",
                    'chunk_index': idx
                })

            print(f" → Created {len(chunks)} chunks")

        except Exception as e:
            # Bug fix: report which file failed instead of a placeholder.
            print(f"❌ Error processing {filename}: {str(e)}")
            continue

    print(f"✅ Total chunks created: {len(all_chunks)}")
    return all_chunks