File size: 1,458 Bytes
78808e7
 
 
 
 
c536a32
 
 
78808e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
import pytesseract
from PIL import Image
import PyPDF2
from transformers import pipeline

import pytesseract
from PIL import Image
import PyPDF2
import docx
from pptx import Presentation
import pandas as pd






def extract_text_from_file(file):
    """Extract plain text from an uploaded PDF, DOCX, PPTX or XLSX document.

    The format is selected from the extension of ``file.filename``
    (case-insensitive) and the content is read from ``file.file``.

    Args:
        file: Upload object exposing ``filename`` (``str`` or ``None``) and
            ``file`` (a binary file-like object). Presumably a FastAPI
            ``UploadFile`` -- confirm against the callers.

    Returns:
        The extracted text as a string. An empty string is returned when
        the filename is missing, has no extension, or has an extension
        other than ``pdf``, ``docx``, ``pptx`` or ``xlsx``.
    """
    filename = file.filename or ""
    # rpartition reports an empty separator when there is no dot, so a bare
    # name such as "pdf" is not mistaken for a file with a .pdf extension.
    _, dot, suffix = filename.rpartition(".")
    file_extension = suffix.lower() if dot else ""

    if file_extension == "pdf":
        # PdfReader / .pages replace PdfFileReader / numPages / getPage,
        # which were removed in PyPDF2 3.x.
        reader = PyPDF2.PdfReader(file.file)
        # Guard against pages that yield no text so the join never sees None.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if file_extension == "docx":
        doc = docx.Document(file.file)
        return "\n".join(para.text for para in doc.paragraphs)

    if file_extension == "pptx":
        prs = Presentation(file.file)
        chunks = []
        for slide in prs.slides:
            for shape in slide.shapes:
                # Only some shape kinds expose a ``text`` attribute.
                if hasattr(shape, "text"):
                    chunks.append(shape.text + "\n")
        return "".join(chunks)

    if file_extension == "xlsx":
        # read_excel with default arguments loads the first sheet only.
        df = pd.read_excel(file.file)
        return df.to_string()

    return ""


def extract_text_from_image(image):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        image: Upload object exposing ``file``, a binary file-like object
            that Pillow can open. Presumably a FastAPI ``UploadFile`` --
            confirm against the callers.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # The context manager releases Pillow's image resources once OCR is
    # done instead of leaving that to garbage collection.
    with Image.open(image.file) as img:
        return pytesseract.image_to_string(img)