Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,35 +8,37 @@ from PIL import Image
|
|
| 8 |
import PyPDF2
|
| 9 |
from io import BytesIO
|
| 10 |
import docx
|
|
|
|
| 11 |
|
| 12 |
# Initialize clients
|
| 13 |
API_KEY = os.environ.get("HF_API_KEY")
|
| 14 |
client = InferenceClient(token=API_KEY)
|
| 15 |
|
| 16 |
-
def process_file(
|
| 17 |
-
"""Handle different file types and extract text"""
|
| 18 |
-
if
|
| 19 |
return ""
|
| 20 |
-
|
| 21 |
# Get file extension
|
| 22 |
-
ext =
|
|
|
|
| 23 |
|
| 24 |
try:
|
| 25 |
-
if ext in ['png', 'jpg', 'jpeg']:
|
| 26 |
# OCR processing for images
|
| 27 |
-
image = Image.open(
|
| 28 |
text = pytesseract.image_to_string(image)
|
| 29 |
return f"IMAGE CONTENT:\n{text}"
|
| 30 |
|
| 31 |
-
elif ext == 'pdf':
|
| 32 |
# PDF text extraction
|
| 33 |
-
pdf_reader = PyPDF2.PdfReader(
|
| 34 |
text = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
| 35 |
return f"PDF CONTENT:\n{text}"
|
| 36 |
|
| 37 |
-
elif ext == 'docx':
|
| 38 |
# Word document processing
|
| 39 |
-
doc = docx.Document(
|
| 40 |
text = "\n".join([para.text for para in doc.paragraphs])
|
| 41 |
return f"DOCUMENT CONTENT:\n{text}"
|
| 42 |
|
|
@@ -47,9 +49,9 @@ def process_file(file):
|
|
| 47 |
print(f"File processing error: {e}")
|
| 48 |
return "Error reading file"
|
| 49 |
|
| 50 |
-
def chat(message, history,
|
| 51 |
# Process uploaded file
|
| 52 |
-
file_content = process_file(
|
| 53 |
|
| 54 |
# Build enhanced prompt
|
| 55 |
full_prompt = f"""
|
|
@@ -57,7 +59,8 @@ def chat(message, history, file):
|
|
| 57 |
|
| 58 |
User Message: {message}
|
| 59 |
|
| 60 |
-
Please respond considering both the message and any attached documents:
|
|
|
|
| 61 |
|
| 62 |
# Configure generation parameters
|
| 63 |
generate_kwargs = dict(
|
|
@@ -77,6 +80,7 @@ def chat(message, history, file):
|
|
| 77 |
|
| 78 |
partial_message = ""
|
| 79 |
for response in stream:
|
|
|
|
| 80 |
if response.token.special:
|
| 81 |
continue
|
| 82 |
partial_message += response.token.text
|
|
@@ -88,8 +92,10 @@ with gr.Blocks(theme="soft") as demo:
|
|
| 88 |
gr.Markdown("Upload images, PDFs, or docs and chat about them!")
|
| 89 |
|
| 90 |
with gr.Row():
|
| 91 |
-
|
|
|
|
| 92 |
|
|
|
|
| 93 |
chatbot = gr.ChatInterface(
|
| 94 |
fn=chat,
|
| 95 |
additional_inputs=[file_input],
|
|
@@ -101,4 +107,3 @@ with gr.Blocks(theme="soft") as demo:
|
|
| 101 |
|
| 102 |
demo.launch()
|
| 103 |
|
| 104 |
-
|
|
|
|
| 8 |
import PyPDF2
|
| 9 |
from io import BytesIO
|
| 10 |
import docx
|
| 11 |
+
import ntpath
|
| 12 |
|
| 13 |
# Initialize clients
|
| 14 |
API_KEY = os.environ.get("HF_API_KEY")
|
| 15 |
client = InferenceClient(token=API_KEY)
|
| 16 |
|
| 17 |
+
def process_file(filepath):
|
| 18 |
+
"""Handle different file types and extract text."""
|
| 19 |
+
if not filepath:
|
| 20 |
return ""
|
| 21 |
+
|
| 22 |
# Get file extension
|
| 23 |
+
# e.g. "/somefolder/Report.PDF" -> ".pdf" (splitext keeps the leading dot; lower() makes the match case-insensitive)
|
| 24 |
+
ext = os.path.splitext(filepath)[1].lower()
|
| 25 |
|
| 26 |
try:
|
| 27 |
+
if ext in ['.png', '.jpg', '.jpeg']:
|
| 28 |
# OCR processing for images
|
| 29 |
+
image = Image.open(filepath)
|
| 30 |
text = pytesseract.image_to_string(image)
|
| 31 |
return f"IMAGE CONTENT:\n{text}"
|
| 32 |
|
| 33 |
+
elif ext == '.pdf':
|
| 34 |
# PDF text extraction
|
| 35 |
+
pdf_reader = PyPDF2.PdfReader(filepath)
|
| 36 |
text = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
| 37 |
return f"PDF CONTENT:\n{text}"
|
| 38 |
|
| 39 |
+
elif ext == '.docx':
|
| 40 |
# Word document processing
|
| 41 |
+
doc = docx.Document(filepath)
|
| 42 |
text = "\n".join([para.text for para in doc.paragraphs])
|
| 43 |
return f"DOCUMENT CONTENT:\n{text}"
|
| 44 |
|
|
|
|
| 49 |
print(f"File processing error: {e}")
|
| 50 |
return "Error reading file"
|
| 51 |
|
| 52 |
+
def chat(message, history, filepath):
|
| 53 |
# Process uploaded file
|
| 54 |
+
file_content = process_file(filepath) if filepath else ""
|
| 55 |
|
| 56 |
# Build enhanced prompt
|
| 57 |
full_prompt = f"""
|
|
|
|
| 59 |
|
| 60 |
User Message: {message}
|
| 61 |
|
| 62 |
+
Please respond considering both the message and any attached documents:
|
| 63 |
+
"""
|
| 64 |
|
| 65 |
# Configure generation parameters
|
| 66 |
generate_kwargs = dict(
|
|
|
|
| 80 |
|
| 81 |
partial_message = ""
|
| 82 |
for response in stream:
|
| 83 |
+
# Skip special tokens
|
| 84 |
if response.token.special:
|
| 85 |
continue
|
| 86 |
partial_message += response.token.text
|
|
|
|
| 92 |
gr.Markdown("Upload images, PDFs, or docs and chat about them!")
|
| 93 |
|
| 94 |
with gr.Row():
|
| 95 |
+
# type="filepath" makes gr.File hand the upload to the callback as a path string, which process_file() opens directly
|
| 96 |
+
file_input = gr.File(label="Upload File (PDF/Image/Doc)", type="filepath")
|
| 97 |
|
| 98 |
+
# additional_inputs appends the uploaded file's path as the third argument of chat(message, history, filepath)
|
| 99 |
chatbot = gr.ChatInterface(
|
| 100 |
fn=chat,
|
| 101 |
additional_inputs=[file_input],
|
|
|
|
| 107 |
|
| 108 |
demo.launch()
|
| 109 |
|
|
|