Spaces:
Sleeping
Sleeping
| import io | |
| import pdfplumber | |
| import pandas as pd | |
| import json | |
| from docx import Document | |
| from openpyxl import load_workbook | |
| import re | |
| import uuid | |
class FileReader:
    """Extract plain text from uploaded files and enforce character limits.

    Every reader funnels its output through ``_clean_text``, so the character
    counts reported by ``calc_chars`` refer to whitespace-collapsed,
    ASCII-only text rather than to the raw file size.
    """

    def __init__(self):
        # Extensions (lower-case, without the dot) accepted by calc_chars.
        # NOTE(review): "xls" is routed to openpyxl, whose documentation only
        # covers the OOXML formats; legacy .xls uploads presumably end up in
        # the 500 branch of calc_chars -- confirm.
        self.allowed_files = ["txt", "pdf", "docx", "md", "json", "csv", "xlsx", "xls"]
        # Upper bound on the cleaned length of any single file.
        self.max_chars_per_file = 5000000

    def calc_chars(self, files, allowed_chars):
        """Read every file, clean its text and check the character budgets.

        Args:
            files: Iterable of upload objects. Each must expose ``filename``
                and be usable as a binary file-like object by the reader for
                its extension (presumably Werkzeug ``FileStorage`` objects;
                verify against the caller).
            allowed_chars: Total character budget across all files. Anything
                accepted by ``int()`` works (an int or a numeric string).

        Returns:
            A ``(payload, status)`` tuple:

            * ``({"total_chars": int, "clean_contents": [...]}, 200)`` on
              success. Each entry holds ``type``, ``content``, ``name``,
              a fresh UUID4 ``id`` and ``total_chars``.
            * ``({"error": ...}, 400)`` for an unsupported or missing
              extension, a file above ``max_chars_per_file``, or a running
              total above ``allowed_chars``.
            * ``({"error": ...}, 500)`` when reading a file raises.

            Processing stops at the first failing file.
        """
        # Dispatch table instead of an if/elif chain: an allowed extension
        # without a reader raises inside the try below instead of silently
        # reusing the previous file's text.
        readers = {
            "txt": self._read_txt,
            "md": self._read_txt,
            "pdf": self._read_pdf,
            "docx": self._read_docx,
            "json": self._read_json,
            "csv": self._read_csv,
            "xlsx": self._read_excel,
            "xls": self._read_excel,
        }
        total_chars = 0
        clean_contents = []
        for file in files:
            # A missing filename or one without a dot has no extension.
            # Previously "txt" (no dot) was treated as a text file and a
            # None filename raised AttributeError outside the try block.
            filename = file.filename or ""
            _, dot, suffix = filename.rpartition(".")
            file_extension = suffix.lower() if dot else ""
            if file_extension not in self.allowed_files:
                return {"error": "unsupported file type uploaded"}, 400
            try:
                text = readers[file_extension](file)
                if len(text) > self.max_chars_per_file:
                    return {"error": "max 5 million characters per file allowed."}, 400
                clean_contents.append({
                    "type": file_extension,
                    "content": text,
                    "name": file.filename,
                    "id": str(uuid.uuid4()),
                    "total_chars": len(text),
                })
                total_chars += len(text)
                if total_chars > int(allowed_chars):
                    return {"error": "Total allowed characters limit reached"}, 400
            except Exception as e:
                # Boundary handler: any reader/parser failure is reported to
                # the caller as a 500 instead of propagating.
                return {"error": f"Error reading file {file.filename}: {e}"}, 500
        return {"total_chars": total_chars, "clean_contents": clean_contents}, 200

    def _read_txt(self, file):
        """Return the cleaned text of a UTF-8 encoded text/markdown file."""
        file_content = file.read().decode("utf-8")
        return self._clean_text(file_content)

    def _read_pdf(self, file):
        """Return the cleaned text of all pages of a PDF."""
        with pdfplumber.open(file) as pdf:
            # extract_text() may yield nothing for a page; treat it as empty.
            pages = [page.extract_text() or "" for page in pdf.pages]
        # Join with a newline so the last word of one page does not fuse
        # with the first word of the next; _clean_text turns it into a space.
        return self._clean_text("\n".join(pages))

    def _read_docx(self, file):
        """Return the cleaned text of every paragraph in a .docx document."""
        doc = Document(file)
        return self._clean_text("\n".join(para.text for para in doc.paragraphs))

    def _read_json(self, file):
        """Parse a JSON file and return its re-serialised, cleaned text."""
        content = json.load(file)
        text = json.dumps(content, ensure_ascii=False)
        return self._clean_text(text)

    def _read_csv(self, file):
        """Return the cleaned, index-free string rendering of a CSV file."""
        df = pd.read_csv(file)
        text = df.to_string(index=False)
        return self._clean_text(text)

    def _read_excel(self, file):
        """Return the cleaned text of every row of every sheet in a workbook.

        Cells of a row are joined with ``" | "``; empty cells become empty
        strings.
        """
        wb = load_workbook(file)
        lines = []
        for sheet in wb.sheetnames:
            for row in wb[sheet].iter_rows(values_only=True):
                lines.append(" | ".join("" if cell is None else str(cell) for cell in row))
        return self._clean_text("\n".join(lines))

    def _clean_text(self, text):
        """Collapse whitespace, drop non-ASCII characters and trim the ends."""
        # Normalise whitespace first so Unicode spaces (e.g. NBSP) become a
        # plain space instead of being deleted by the ASCII filter below.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        # Deleting a stand-alone non-ASCII token leaves the spaces that
        # surrounded it side by side; collapse them so they are not counted.
        text = re.sub(r' {2,}', ' ', text)
        return text.strip()


# Shared module-level instance for importers of this module.
file_reader = FileReader()