from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
import pytesseract
from PIL import Image
import PyPDF2
from transformers import pipeline
import docx
from pptx import Presentation
import pandas as pd


def _extract_pdf_text(stream) -> str:
    """Return the text of every page of the PDF in *stream*, concatenated."""
    # PdfReader / .pages replace PdfFileReader / numPages / getPage, which
    # were removed in PyPDF2 3.0.
    reader = PyPDF2.PdfReader(stream)
    # `or ""` keeps the join safe should extract_text() yield None for a page.
    return "".join(page.extract_text() or "" for page in reader.pages)


def _extract_docx_text(stream) -> str:
    """Return the paragraphs of the Word document in *stream*, one per line."""
    document = docx.Document(stream)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def _extract_pptx_text(stream) -> str:
    """Return the text of every text-bearing shape, each followed by a newline."""
    presentation = Presentation(stream)
    chunks = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            # Only some shape objects expose a `text` attribute.
            if hasattr(shape, "text"):
                chunks.append(shape.text + "\n")
    return "".join(chunks)


def _extract_xlsx_text(stream) -> str:
    """Return the workbook in *stream* rendered via ``DataFrame.to_string()``.

    ``pd.read_excel`` is called with its defaults, so only the first
    worksheet is read.
    """
    return pd.read_excel(stream).to_string()


# Maps a lower-cased file extension (without the dot) to its extractor.
_TEXT_EXTRACTORS = {
    "pdf": _extract_pdf_text,
    "docx": _extract_docx_text,
    "pptx": _extract_pptx_text,
    "xlsx": _extract_xlsx_text,
}


def extract_text_from_file(file: UploadFile) -> str:
    """Extract plain text from an uploaded PDF, DOCX, PPTX or XLSX file.

    The format is chosen from the extension of ``file.filename``
    (case-insensitive); the content is read from ``file.file``.

    Args:
        file: Upload object exposing ``filename`` and a binary ``file`` stream.

    Returns:
        The extracted text, or ``""`` when the filename is missing, has no
        extension, or has an extension that is not supported.
    """
    # The filename may be absent on an upload; treat that as "unsupported".
    filename = file.filename or ""
    _, dot, extension = filename.rpartition(".")
    if not dot:
        # No "." at all: a bare name such as "pdf" is not a PDF extension.
        return ""
    extractor = _TEXT_EXTRACTORS.get(extension.lower())
    if extractor is None:
        return ""
    return extractor(file.file)


def extract_text_from_image(image: UploadFile) -> str:
    """Run Tesseract OCR on an uploaded image and return the recognised text.

    Args:
        image: Upload object whose ``file`` attribute is a binary image stream.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager releases Pillow's image resources once OCR is done.
    with Image.open(image.file) as img:
        return pytesseract.image_to_string(img)