Spaces:

anasmkh
/

QdrantVectorStore_Llamaindex

Sleeping

App Files Files Community

anasmkh committed on Feb 12, 2025

Commit

769d8f2

verified ·

1 Parent(s): 5f94cf5

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -3

app.py CHANGED Viewed

@@ -76,25 +76,98 @@ def chat_with_ai(user_input, chat_history):
 def clear_history():
     """Return a brand-new empty list paired with an empty string."""
     emptied_history = []
     blank_text = ""
     return emptied_history, blank_text
 def upload_file(file):
     if file is None:
         return "No file uploaded!"
     if isinstance(file, list):
         file = file[0]
     if hasattr(file, 'name'):
         file_name = file.name
     elif isinstance(file, dict):
         file_name = file.get("name", "uploaded_file")
     else:
-        file_name = "uploaded_file"
     if not os.path.exists("new_file"):
         os.makedirs("new_file")
     file_path = os.path.join("new_file", file_name)
     if hasattr(file, "read"):
         content = file.read()

 def clear_history():
     """Return a brand-new empty list paired with an empty string."""
     emptied_history = []
     blank_text = ""
     return emptied_history, blank_text
+import os
+import PyPDF2
+import docx
+import pandas as pd
def extract_text_from_file(file_path):
    """Extract plain text from a file, choosing a parser by file extension.

    Supported extensions (matched case-insensitively): ``.pdf``,
    ``.doc`` / ``.docx``, ``.txt`` and ``.xls`` / ``.xlsx``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a string. Parser failures are not raised:
        any exception is converted into an ``"Error processing ...: <exc>"``
        string, and an unrecognized extension yields
        ``"Unsupported file type for text extraction."``. Callers therefore
        cannot distinguish an error message from real content by type.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                # Extract inside the `with` block so the reader still has
                # its underlying stream available.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            return f"Error processing PDF: {e}"
        # Pages without extractable text are skipped; every kept page is
        # followed by a newline. One join avoids repeated string copies.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    if ext in (".doc", ".docx"):
        # NOTE(review): python-docx is documented for .docx; a legacy binary
        # .doc will presumably end up in the error string below -- confirm.
        try:
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs)
        except Exception as e:
            return f"Error processing Word document: {e}"

    if ext == ".txt":
        try:
            # "utf-8-sig" reads ordinary UTF-8 and additionally drops a
            # leading byte-order mark, so BOM-prefixed files no longer
            # produce a stray "\ufeff" as the first character.
            with open(file_path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except Exception as e:
            return f"Error processing TXT file: {e}"

    if ext in (".xls", ".xlsx"):
        try:
            # read_excel is called with defaults, i.e. only the first sheet.
            df = pd.read_excel(file_path)
            # Serialize the sheet as CSV text, without the index column.
            return df.to_csv(index=False)
        except Exception as e:
            return f"Error processing Excel file: {e}"

    return "Unsupported file type for text extraction."
 def upload_file(file):
     if file is None:
         return "No file uploaded!"
     if isinstance(file, list):
         file = file[0]
     if hasattr(file, 'name'):
         file_name = file.name
+        file_data = file.read()
     elif isinstance(file, dict):
         file_name = file.get("name", "uploaded_file")
+        file_data = file.get("data")
     else:
+        return "Uploaded file format not recognized."
+    if file_data is None:
+        return "Uploaded file data not found!"
     if not os.path.exists("new_file"):
         os.makedirs("new_file")
+    file_path = os.path.join("new_file", file_name)
+    try:
+        with open(file_path, "wb") as f:
+            f.write(file_data)
+    except Exception as e:
+        return f"Error saving file: {e}"
+    extracted_text = extract_text_from_file(file_path)
+    preview = extracted_text[:200] + "..." if len(extracted_text) > 200 else extracted_text
+    return f"File {file_name} uploaded and processed successfully!\nExtracted text preview:\n{preview}"
     file_path = os.path.join("new_file", file_name)
     if hasattr(file, "read"):
         content = file.read()