Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, File, UploadFile, Form, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import HTMLResponse | |
| from fastapi.staticfiles import StaticFiles | |
| import pytesseract | |
| from PIL import Image | |
| import PyPDF2 | |
| from transformers import pipeline | |
| import pytesseract | |
| from PIL import Image | |
| import PyPDF2 | |
| import docx | |
| from pptx import Presentation | |
| import pandas as pd | |
def extract_text_from_file(file) -> str:
    """Extract plain text from an uploaded document, chosen by file extension.

    The extension is the text after the last ``.`` in ``file.filename``,
    compared case-insensitively. Supported values are ``pdf``, ``docx``,
    ``pptx`` and ``xlsx``; anything else, including a missing or empty
    filename, yields an empty string.

    Args:
        file: Upload object exposing ``filename`` (``str`` or ``None``) and
            ``file`` (a binary file-like object). Presumably a FastAPI
            ``UploadFile`` -- confirm against the caller.

    Returns:
        The extracted text:
        * pdf  -- the text of every page, concatenated with no separator;
        * docx -- paragraph texts joined with newlines;
        * pptx -- the text of every shape that has a ``text`` attribute,
          each followed by a newline;
        * xlsx -- ``DataFrame.to_string()`` of what ``pd.read_excel`` loads
          with default arguments (the first sheet);
        * otherwise ``""``.
    """
    # A missing filename is treated as "unsupported type" instead of
    # raising AttributeError on None.
    filename = file.filename or ""
    # Text after the last dot; a name with no dot is compared as a whole.
    file_extension = filename.rsplit(".", 1)[-1].lower()

    if file_extension == "pdf":
        # PdfFileReader / numPages / getPage were removed in PyPDF2 3.0;
        # PdfReader and .pages are the supported equivalents.
        reader = PyPDF2.PdfReader(file.file)
        # Guard against a page yielding no text so the join cannot fail.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if file_extension == "docx":
        document = docx.Document(file.file)
        return "\n".join(para.text for para in document.paragraphs)

    if file_extension == "pptx":
        prs = Presentation(file.file)
        # Not every shape carries text (pictures, connectors, ...), hence
        # the hasattr filter.
        return "".join(
            shape.text + "\n"
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    if file_extension == "xlsx":
        return pd.read_excel(file.file).to_string()

    # Unsupported or unknown file type.
    return ""
def extract_text_from_image(image):
    """Run Tesseract OCR on an uploaded image and return the recognised text.

    ``image`` must expose a ``file`` attribute that ``PIL.Image.open`` can
    read (presumably a FastAPI ``UploadFile`` -- confirm against the caller).
    The opened image is handed straight to ``pytesseract.image_to_string``
    and its result is returned unchanged.
    """
    return pytesseract.image_to_string(Image.open(image.file))