# Source: uploaded by yousbek ("Upload 8 files", commit 78808e7, verified).
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
import pytesseract
from PIL import Image
import PyPDF2
from transformers import pipeline
import pytesseract
from PIL import Image
import PyPDF2
import docx
from pptx import Presentation
import pandas as pd
def extract_text_from_file(file):
    """Extract plain text from an uploaded document, chosen by file extension.

    Supported extensions (matched case-insensitively) are ``pdf``, ``docx``,
    ``pptx`` and ``xlsx``.

    Args:
        file: Upload object exposing ``filename`` (the client-supplied name,
            which may be ``None``) and ``file`` (a binary file-like object
            handed to the format-specific parser).

    Returns:
        The extracted text, or ``""`` when the filename is missing, has no
        extension, or has an extension that is not supported.
    """
    # Require a real "." separator so a bare name such as "pdf" is not
    # mistaken for an extension; a missing filename yields no extension.
    _, dot, suffix = (file.filename or "").rpartition(".")
    file_extension = suffix.lower() if dot else ""

    if file_extension == "pdf":
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which PyPDF2 3.0 removed.
        reader = PyPDF2.PdfReader(file.file)
        # Guard against a page yielding no text so the join never sees None.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if file_extension == "docx":
        document = docx.Document(file.file)
        return "\n".join(para.text for para in document.paragraphs)

    if file_extension == "pptx":
        presentation = Presentation(file.file)
        # Not every shape carries text (pictures, connectors, ...), hence
        # the hasattr filter. Each shape's text is newline-terminated.
        return "".join(
            shape.text + "\n"
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )

    if file_extension == "xlsx":
        # read_excel with its default sheet selection loads only the first
        # worksheet; the frame is rendered as a fixed-width text table.
        return pd.read_excel(file.file).to_string()

    return ""
def extract_text_from_image(image):
    """Run Tesseract OCR over an uploaded image and return the recognised text.

    Args:
        image: Upload object whose ``file`` attribute is a binary file-like
            object that Pillow can open.

    Returns:
        The result of ``pytesseract.image_to_string`` for the opened image.
    """
    return pytesseract.image_to_string(Image.open(image.file))