Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
|
| 9 |
from langchain.chains.question_answering import load_qa_chain
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
load_dotenv()
|
| 14 |
os.getenv("GOOGLE_API_KEY")
|
|
@@ -17,12 +22,27 @@ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
|
| 17 |
|
| 18 |
def get_pdf_text(pdf_docs):
|
| 19 |
text = ""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
return text
|
| 27 |
|
| 28 |
|
|
|
|
| 9 |
from langchain.chains.question_answering import load_qa_chain
|
| 10 |
from langchain.prompts import PromptTemplate
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
+
from pdf2image import convert_from_bytes
|
| 13 |
+
from PIL import Image
|
| 14 |
+
import pytesseract
|
| 15 |
+
import io
|
| 16 |
+
|
| 17 |
|
| 18 |
load_dotenv()
|
| 19 |
os.getenv("GOOGLE_API_KEY")
|
|
|
|
| 22 |
|
| 23 |
def get_pdf_text(pdf_docs):
    """Extract text from uploaded PDF and image files, using OCR when needed.

    For each uploaded file:

    * Files whose name ends in ``.pdf`` (case-insensitive) are read with
      ``PdfReader`` and the text of every page is concatenated. If that
      yields no non-whitespace text for *this* file, the pages are rendered
      to images and run through Tesseract OCR.
    * Every other file is opened as an image and run through OCR directly.

    Args:
        pdf_docs: Iterable of uploaded file objects. Each must expose a
            ``name`` attribute and behave as a seekable binary stream
            (``read``/``seek``) -- NOTE(review): presumably Streamlit
            ``UploadedFile`` objects; confirm against the caller.

    Returns:
        The extracted text of all files concatenated in upload order, or an
        empty string when ``pdf_docs`` is empty.
    """
    extracted = []

    for uploaded_file in pdf_docs:
        # Compare case-insensitively so "SCAN.PDF" is not mistaken for an image.
        if uploaded_file.name.lower().endswith(".pdf"):
            pdf_reader = PdfReader(uploaded_file)
            # extract_text() may return an empty value for image-only pages.
            file_text = "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )

            # Decide on OCR per file: checking the text accumulated from
            # earlier files would skip OCR for a scanned PDF that follows
            # any file that already produced text.
            if not file_text.strip():
                # PdfReader has already consumed the stream; rewind so that
                # read() returns the complete document for rasterization.
                uploaded_file.seek(0)
                images = convert_from_bytes(uploaded_file.read())
                file_text += "".join(
                    pytesseract.image_to_string(img) for img in images
                )
        else:
            # Anything that is not a PDF is treated as an image file.
            image = Image.open(uploaded_file)
            file_text = pytesseract.image_to_string(image)

        extracted.append(file_text)

    return "".join(extracted)
|
| 47 |
|
| 48 |
|