Ali Abdullah committed on
Commit
15dc739
·
verified ·
1 Parent(s): f235c24

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +51 -36
main.py CHANGED
@@ -3,36 +3,41 @@ from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
  from groq import Groq
5
  from langchain_community.document_loaders import WebBaseLoader
6
- from docx import Document
7
- import pandas as pd
8
- import PyPDF2
 
9
  from PIL import Image
10
  import pytesseract
11
  import whisper
12
- import os
13
- import io
 
 
 
 
 
14
 
15
- # === ENVIRONMENT SETUP ===
16
- # Hugging Face Spaces inject secrets automatically
17
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
18
- TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
19
- FFMPEG_PATH = os.getenv("FFMPEG_PATH", "/usr/bin")
20
 
21
- # Ensure paths
22
- pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
23
- os.environ["PATH"] += os.pathsep + FFMPEG_PATH
 
24
 
25
- # === FastAPI APP INIT ===
26
  app = FastAPI()
 
 
 
 
27
  UPLOAD_DIR = "uploaded_files"
28
  os.makedirs(UPLOAD_DIR, exist_ok=True)
29
 
30
- # === Groq API Init ===
31
- if not GROQ_API_KEY:
32
- raise ValueError("GROQ_API_KEY not found in environment.")
33
- client = Groq(api_key=GROQ_API_KEY)
34
 
35
- # === Extract text from file ===
36
  def extract_text_from_file(file_path):
37
  ext = os.path.splitext(file_path)[-1].lower()
38
  try:
@@ -40,26 +45,30 @@ def extract_text_from_file(file_path):
40
  with open(file_path, "r", encoding="utf-8") as f:
41
  return f.read()
42
  elif ext == ".docx":
43
- return "\n".join([p.text for p in Document(file_path).paragraphs])
 
44
  elif ext == ".csv":
45
  df = pd.read_csv(file_path)
46
  return df.to_string(index=False)
47
  elif ext == ".pdf":
48
  with open(file_path, "rb") as f:
49
  reader = PyPDF2.PdfReader(f)
50
- return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])
51
  else:
52
  return "❌ Unsupported file type."
53
  except Exception as e:
54
- return f"❌ Failed to read file: {str(e)}"
 
 
55
 
56
- # === Endpoint: Chat with File ===
57
  @app.post("/chat-with-file")
58
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
59
  try:
60
  contents = await file.read()
61
- if len(contents) > 10 * 1024 * 1024:
62
- return JSONResponse(status_code=400, content={"error": "❌ File too large (max 10MB)."})
 
63
 
64
  file_path = os.path.join(UPLOAD_DIR, file.filename)
65
  with open(file_path, "wb") as f:
@@ -74,12 +83,14 @@ async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)
74
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
75
  ]
76
  )
77
- return {"answer": response.choices[0].message.content}
78
 
 
79
  except Exception as e:
80
- return JSONResponse(status_code=500, content={"error": f" {str(e)}"})
 
 
81
 
82
- # === Endpoint: Chat with URL ===
83
  class URLQuery(BaseModel):
84
  url: str
85
  question: str
@@ -87,7 +98,7 @@ class URLQuery(BaseModel):
87
  @app.post("/chat-with-url")
88
  async def chat_with_url(data: URLQuery):
89
  try:
90
- os.environ["USER_AGENT"] = "Mozilla/5.0"
91
  loader = WebBaseLoader(data.url)
92
  documents = loader.load()
93
  web_content = "\n".join([doc.page_content for doc in documents])
@@ -100,11 +111,12 @@ async def chat_with_url(data: URLQuery):
100
  ]
101
  )
102
  return {"answer": response.choices[0].message.content}
103
-
104
  except Exception as e:
105
- return JSONResponse(status_code=500, content={"error": f" {str(e)}"})
 
 
106
 
107
- # === Endpoint: OCR from Image ===
108
  @app.post("/extract-text-from-image")
109
  async def extract_text_from_image(file: UploadFile = File(...)):
110
  try:
@@ -113,9 +125,11 @@ async def extract_text_from_image(file: UploadFile = File(...)):
113
  text = pytesseract.image_to_string(image)
114
  return {"answer": text.strip() or "⚠️ No text extracted."}
115
  except Exception as e:
116
- return JSONResponse(status_code=500, content={"error": f" {str(e)}"})
 
 
117
 
118
- # === Endpoint: Transcribe Audio ===
119
  @app.post("/transcribe-audio")
120
  async def transcribe_audio(file: UploadFile = File(...)):
121
  try:
@@ -127,6 +141,7 @@ async def transcribe_audio(file: UploadFile = File(...)):
127
  model = whisper.load_model("base")
128
  result = model.transcribe(audio_path)
129
  return {"answer": result.get("text", "⚠️ No transcript returned.")}
130
-
131
  except Exception as e:
132
- return JSONResponse(status_code=500, content={"error": f" {str(e)}"})
 
 
 
3
  from pydantic import BaseModel
4
  from groq import Groq
5
  from langchain_community.document_loaders import WebBaseLoader
6
+
7
+ import os
8
+ import io
9
+ from dotenv import load_dotenv
10
  from PIL import Image
11
  import pytesseract
12
  import whisper
13
+ import traceback # for detailed error logging
14
+
15
+ # Load environment variables
16
+ load_dotenv()
17
+
18
+ # Tesseract path
19
+ pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
20
 
21
+ # FFmpeg path
22
+ ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
23
+ os.environ["PATH"] += os.pathsep + ffmpeg_path
 
 
24
 
25
+ # File reading
26
+ from docx import Document
27
+ import pandas as pd
28
+ import PyPDF2
29
 
 
30
  app = FastAPI()
31
+
32
+ # Groq client
33
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
34
+
35
  UPLOAD_DIR = "uploaded_files"
36
  os.makedirs(UPLOAD_DIR, exist_ok=True)
37
 
38
+ MAX_FILE_SIZE_MB = 10
 
 
 
39
 
40
+ # --- Helper: Extract Text from File ---
41
  def extract_text_from_file(file_path):
42
  ext = os.path.splitext(file_path)[-1].lower()
43
  try:
 
45
  with open(file_path, "r", encoding="utf-8") as f:
46
  return f.read()
47
  elif ext == ".docx":
48
+ doc = Document(file_path)
49
+ return "\n".join([para.text for para in doc.paragraphs])
50
  elif ext == ".csv":
51
  df = pd.read_csv(file_path)
52
  return df.to_string(index=False)
53
  elif ext == ".pdf":
54
  with open(file_path, "rb") as f:
55
  reader = PyPDF2.PdfReader(f)
56
+ return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
57
  else:
58
  return "❌ Unsupported file type."
59
  except Exception as e:
60
+ print("❌ File parsing error:", str(e))
61
+ traceback.print_exc()
62
+ return "⚠️ Failed to extract text."
63
 
64
+ # --- Endpoint: Chat with File ---
65
  @app.post("/chat-with-file")
66
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
67
  try:
68
  contents = await file.read()
69
+
70
+ if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
71
+ return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
72
 
73
  file_path = os.path.join(UPLOAD_DIR, file.filename)
74
  with open(file_path, "wb") as f:
 
83
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
84
  ]
85
  )
 
86
 
87
+ return {"answer": response.choices[0].message.content}
88
  except Exception as e:
89
+ print("❌ Chat with file error:", str(e))
90
+ traceback.print_exc()
91
+ return JSONResponse(status_code=500, content={"error": str(e)})
92
 
93
+ # --- Endpoint: Chat with URL ---
94
  class URLQuery(BaseModel):
95
  url: str
96
  question: str
 
98
  @app.post("/chat-with-url")
99
  async def chat_with_url(data: URLQuery):
100
  try:
101
+ os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
102
  loader = WebBaseLoader(data.url)
103
  documents = loader.load()
104
  web_content = "\n".join([doc.page_content for doc in documents])
 
111
  ]
112
  )
113
  return {"answer": response.choices[0].message.content}
 
114
  except Exception as e:
115
+ print("❌ Chat with URL error:", str(e))
116
+ traceback.print_exc()
117
+ return JSONResponse(status_code=500, content={"error": str(e)})
118
 
119
+ # --- Endpoint: OCR Image ---
120
  @app.post("/extract-text-from-image")
121
  async def extract_text_from_image(file: UploadFile = File(...)):
122
  try:
 
125
  text = pytesseract.image_to_string(image)
126
  return {"answer": text.strip() or "⚠️ No text extracted."}
127
  except Exception as e:
128
+ print("❌ Image OCR error:", str(e))
129
+ traceback.print_exc()
130
+ return JSONResponse(status_code=500, content={"error": str(e)})
131
 
132
+ # --- Endpoint: Transcribe Audio ---
133
  @app.post("/transcribe-audio")
134
  async def transcribe_audio(file: UploadFile = File(...)):
135
  try:
 
141
  model = whisper.load_model("base")
142
  result = model.transcribe(audio_path)
143
  return {"answer": result.get("text", "⚠️ No transcript returned.")}
 
144
  except Exception as e:
145
+ print("❌ Audio transcription error:", str(e))
146
+ traceback.print_exc()
147
+ return JSONResponse(status_code=500, content={"error": str(e)})