Spaces:

Al1Abdullah
/

AI_Chatbot_File_Web_Image_Audio

Sleeping

App Files Community

Ali Abdullah committed on Jun 26, 2025

Commit

f235c24

verified ·

1 Parent(s): 014e0ec

Update main.py

Browse files

Files changed (1) hide show

main.py +52 -53

main.py CHANGED Viewed

@@ -3,66 +3,63 @@ from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from groq import Groq
 from langchain_community.document_loaders import WebBaseLoader
-import os
-import io
-from dotenv import load_dotenv
 from PIL import Image
 import pytesseract
 import whisper
-# Load environment variables
-load_dotenv()
-# Tesseract path
-pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
-# FFmpeg path (for local development only)
-ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
-os.environ["PATH"] += os.pathsep + ffmpeg_path
-# File reading libraries
-from docx import Document
-import pandas as pd
-import PyPDF2
 app = FastAPI()
-# Use Groq API key from secrets
-client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 UPLOAD_DIR = "uploaded_files"
 os.makedirs(UPLOAD_DIR, exist_ok=True)
-MAX_FILE_SIZE_MB = 10
-# ---------- File Text Extraction ----------
 def extract_text_from_file(file_path):
     ext = os.path.splitext(file_path)[-1].lower()
-    if ext == ".txt":
-        with open(file_path, "r", encoding="utf-8") as f:
-            return f.read()
-    elif ext == ".docx":
-        doc = Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
-    elif ext == ".csv":
-        df = pd.read_csv(file_path)
-        return df.to_string(index=False)
-    elif ext == ".pdf":
-        with open(file_path, "rb") as f:
-            reader = PyPDF2.PdfReader(f)
-            return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
-    else:
-        return "❌ Unsupported file type."
-# ---------- Chat with File ----------
 @app.post("/chat-with-file")
 async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
     try:
         contents = await file.read()
-        if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
-            return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
         file_path = os.path.join(UPLOAD_DIR, file.filename)
         with open(file_path, "wb") as f:
@@ -77,12 +74,12 @@ async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)
                 {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
             ]
         )
         return {"answer": response.choices[0].message.content}
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})
-# ---------- Chat with URL ----------
 class URLQuery(BaseModel):
     url: str
     question: str
@@ -90,7 +87,7 @@ class URLQuery(BaseModel):
 @app.post("/chat-with-url")
 async def chat_with_url(data: URLQuery):
     try:
-        os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
         loader = WebBaseLoader(data.url)
         documents = loader.load()
         web_content = "\n".join([doc.page_content for doc in documents])
@@ -103,10 +100,11 @@ async def chat_with_url(data: URLQuery):
             ]
         )
         return {"answer": response.choices[0].message.content}
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})
-# ---------- Extract Text from Image ----------
 @app.post("/extract-text-from-image")
 async def extract_text_from_image(file: UploadFile = File(...)):
     try:
@@ -115,9 +113,9 @@ async def extract_text_from_image(file: UploadFile = File(...)):
         text = pytesseract.image_to_string(image)
         return {"answer": text.strip() or "⚠️ No text extracted."}
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})
-# ---------- Transcribe Audio ----------
 @app.post("/transcribe-audio")
 async def transcribe_audio(file: UploadFile = File(...)):
     try:
@@ -128,6 +126,7 @@ async def transcribe_audio(file: UploadFile = File(...)):
         model = whisper.load_model("base")
         result = model.transcribe(audio_path)
-        return {"answer": result["text"] if result.get("text") else "⚠️ No transcript returned."}
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})

 from pydantic import BaseModel
 from groq import Groq
 from langchain_community.document_loaders import WebBaseLoader
+from docx import Document
+import pandas as pd
+import PyPDF2
 from PIL import Image
 import pytesseract
 import whisper
+import os
+import io
+# === ENVIRONMENT SETUP ===
+# Hugging Face Spaces inject secrets automatically
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
+FFMPEG_PATH = os.getenv("FFMPEG_PATH", "/usr/bin")
+# Ensure paths
+pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
+os.environ["PATH"] += os.pathsep + FFMPEG_PATH
+# === FastAPI APP INIT ===
 app = FastAPI()
 UPLOAD_DIR = "uploaded_files"
 os.makedirs(UPLOAD_DIR, exist_ok=True)
+# === Groq API Init ===
+if not GROQ_API_KEY:
+    raise ValueError("GROQ_API_KEY not found in environment.")
+client = Groq(api_key=GROQ_API_KEY)
+# === Extract text from file ===
 def extract_text_from_file(file_path):
     ext = os.path.splitext(file_path)[-1].lower()
+    try:
+        if ext == ".txt":
+            with open(file_path, "r", encoding="utf-8") as f:
+                return f.read()
+        elif ext == ".docx":
+            return "\n".join([p.text for p in Document(file_path).paragraphs])
+        elif ext == ".csv":
+            df = pd.read_csv(file_path)
+            return df.to_string(index=False)
+        elif ext == ".pdf":
+            with open(file_path, "rb") as f:
+                reader = PyPDF2.PdfReader(f)
+                return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])
+        else:
+            return "❌ Unsupported file type."
+    except Exception as e:
+        return f"❌ Failed to read file: {str(e)}"
+# === Endpoint: Chat with File ===
 @app.post("/chat-with-file")
 async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
     try:
         contents = await file.read()
+        if len(contents) > 10 * 1024 * 1024:
+            return JSONResponse(status_code=400, content={"error": "❌ File too large (max 10MB)."})
         file_path = os.path.join(UPLOAD_DIR, file.filename)
         with open(file_path, "wb") as f:
                 {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
             ]
         )
         return {"answer": response.choices[0].message.content}
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
+# === Endpoint: Chat with URL ===
 class URLQuery(BaseModel):
     url: str
     question: str
 @app.post("/chat-with-url")
 async def chat_with_url(data: URLQuery):
     try:
+        os.environ["USER_AGENT"] = "Mozilla/5.0"
         loader = WebBaseLoader(data.url)
         documents = loader.load()
         web_content = "\n".join([doc.page_content for doc in documents])
             ]
         )
         return {"answer": response.choices[0].message.content}
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
+# === Endpoint: OCR from Image ===
 @app.post("/extract-text-from-image")
 async def extract_text_from_image(file: UploadFile = File(...)):
     try:
         text = pytesseract.image_to_string(image)
         return {"answer": text.strip() or "⚠️ No text extracted."}
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
+# === Endpoint: Transcribe Audio ===
 @app.post("/transcribe-audio")
 async def transcribe_audio(file: UploadFile = File(...)):
     try:
         model = whisper.load_model("base")
         result = model.transcribe(audio_path)
+        return {"answer": result.get("text", "⚠️ No transcript returned.")}
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})