Ali Abdullah committed on
Commit
f235c24
·
verified ·
1 Parent(s): 014e0ec

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +52 -53
main.py CHANGED
@@ -3,66 +3,63 @@ from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
  from groq import Groq
5
  from langchain_community.document_loaders import WebBaseLoader
6
-
7
- import os
8
- import io
9
- from dotenv import load_dotenv
10
  from PIL import Image
11
  import pytesseract
12
  import whisper
 
 
13
 
14
- # Load environment variables
15
- load_dotenv()
16
-
17
- # Tesseract path
18
- pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
19
 
20
- # FFmpeg path (for local development only)
21
- ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
22
- os.environ["PATH"] += os.pathsep + ffmpeg_path
23
-
24
- # File reading libraries
25
- from docx import Document
26
- import pandas as pd
27
- import PyPDF2
28
 
 
29
  app = FastAPI()
30
-
31
- # Use Groq API key from secrets
32
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
33
-
34
  UPLOAD_DIR = "uploaded_files"
35
  os.makedirs(UPLOAD_DIR, exist_ok=True)
36
 
37
- MAX_FILE_SIZE_MB = 10
 
 
 
38
 
39
- # ---------- File Text Extraction ----------
40
  def extract_text_from_file(file_path):
41
  ext = os.path.splitext(file_path)[-1].lower()
42
- if ext == ".txt":
43
- with open(file_path, "r", encoding="utf-8") as f:
44
- return f.read()
45
- elif ext == ".docx":
46
- doc = Document(file_path)
47
- return "\n".join([para.text for para in doc.paragraphs])
48
- elif ext == ".csv":
49
- df = pd.read_csv(file_path)
50
- return df.to_string(index=False)
51
- elif ext == ".pdf":
52
- with open(file_path, "rb") as f:
53
- reader = PyPDF2.PdfReader(f)
54
- return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
55
- else:
56
- return "❌ Unsupported file type."
57
-
58
- # ---------- Chat with File ----------
 
 
59
  @app.post("/chat-with-file")
60
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
61
  try:
62
  contents = await file.read()
63
-
64
- if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
65
- return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
66
 
67
  file_path = os.path.join(UPLOAD_DIR, file.filename)
68
  with open(file_path, "wb") as f:
@@ -77,12 +74,12 @@ async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)
77
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
78
  ]
79
  )
80
-
81
  return {"answer": response.choices[0].message.content}
 
82
  except Exception as e:
83
- return JSONResponse(status_code=500, content={"error": str(e)})
84
 
85
- # ---------- Chat with URL ----------
86
  class URLQuery(BaseModel):
87
  url: str
88
  question: str
@@ -90,7 +87,7 @@ class URLQuery(BaseModel):
90
  @app.post("/chat-with-url")
91
  async def chat_with_url(data: URLQuery):
92
  try:
93
- os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
94
  loader = WebBaseLoader(data.url)
95
  documents = loader.load()
96
  web_content = "\n".join([doc.page_content for doc in documents])
@@ -103,10 +100,11 @@ async def chat_with_url(data: URLQuery):
103
  ]
104
  )
105
  return {"answer": response.choices[0].message.content}
 
106
  except Exception as e:
107
- return JSONResponse(status_code=500, content={"error": str(e)})
108
 
109
- # ---------- Extract Text from Image ----------
110
  @app.post("/extract-text-from-image")
111
  async def extract_text_from_image(file: UploadFile = File(...)):
112
  try:
@@ -115,9 +113,9 @@ async def extract_text_from_image(file: UploadFile = File(...)):
115
  text = pytesseract.image_to_string(image)
116
  return {"answer": text.strip() or "⚠️ No text extracted."}
117
  except Exception as e:
118
- return JSONResponse(status_code=500, content={"error": str(e)})
119
 
120
- # ---------- Transcribe Audio ----------
121
  @app.post("/transcribe-audio")
122
  async def transcribe_audio(file: UploadFile = File(...)):
123
  try:
@@ -128,6 +126,7 @@ async def transcribe_audio(file: UploadFile = File(...)):
128
 
129
  model = whisper.load_model("base")
130
  result = model.transcribe(audio_path)
131
- return {"answer": result["text"] if result.get("text") else "⚠️ No transcript returned."}
 
132
  except Exception as e:
133
- return JSONResponse(status_code=500, content={"error": str(e)})
 
3
  from pydantic import BaseModel
4
  from groq import Groq
5
  from langchain_community.document_loaders import WebBaseLoader
6
+ from docx import Document
7
+ import pandas as pd
8
+ import PyPDF2
 
9
  from PIL import Image
10
  import pytesseract
11
  import whisper
12
+ import os
13
+ import io
14
 
15
+ # === ENVIRONMENT SETUP ===
16
+ # Hugging Face Spaces inject secrets automatically
17
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
18
+ TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
19
+ FFMPEG_PATH = os.getenv("FFMPEG_PATH", "/usr/bin")
20
 
21
+ # Ensure paths
22
+ pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
23
+ os.environ["PATH"] += os.pathsep + FFMPEG_PATH
 
 
 
 
 
24
 
25
+ # === FastAPI APP INIT ===
26
  app = FastAPI()
 
 
 
 
27
  UPLOAD_DIR = "uploaded_files"
28
  os.makedirs(UPLOAD_DIR, exist_ok=True)
29
 
30
+ # === Groq API Init ===
31
+ if not GROQ_API_KEY:
32
+ raise ValueError("GROQ_API_KEY not found in environment.")
33
+ client = Groq(api_key=GROQ_API_KEY)
34
 
35
+ # === Extract text from file ===
36
  def extract_text_from_file(file_path):
37
  ext = os.path.splitext(file_path)[-1].lower()
38
+ try:
39
+ if ext == ".txt":
40
+ with open(file_path, "r", encoding="utf-8") as f:
41
+ return f.read()
42
+ elif ext == ".docx":
43
+ return "\n".join([p.text for p in Document(file_path).paragraphs])
44
+ elif ext == ".csv":
45
+ df = pd.read_csv(file_path)
46
+ return df.to_string(index=False)
47
+ elif ext == ".pdf":
48
+ with open(file_path, "rb") as f:
49
+ reader = PyPDF2.PdfReader(f)
50
+ return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])
51
+ else:
52
+ return "❌ Unsupported file type."
53
+ except Exception as e:
54
+ return f"❌ Failed to read file: {str(e)}"
55
+
56
+ # === Endpoint: Chat with File ===
57
  @app.post("/chat-with-file")
58
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
59
  try:
60
  contents = await file.read()
61
+ if len(contents) > 10 * 1024 * 1024:
62
+ return JSONResponse(status_code=400, content={"error": "❌ File too large (max 10MB)."})
 
63
 
64
  file_path = os.path.join(UPLOAD_DIR, file.filename)
65
  with open(file_path, "wb") as f:
 
74
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
75
  ]
76
  )
 
77
  return {"answer": response.choices[0].message.content}
78
+
79
  except Exception as e:
80
+ return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
81
 
82
+ # === Endpoint: Chat with URL ===
83
  class URLQuery(BaseModel):
84
  url: str
85
  question: str
 
87
  @app.post("/chat-with-url")
88
  async def chat_with_url(data: URLQuery):
89
  try:
90
+ os.environ["USER_AGENT"] = "Mozilla/5.0"
91
  loader = WebBaseLoader(data.url)
92
  documents = loader.load()
93
  web_content = "\n".join([doc.page_content for doc in documents])
 
100
  ]
101
  )
102
  return {"answer": response.choices[0].message.content}
103
+
104
  except Exception as e:
105
+ return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
106
 
107
+ # === Endpoint: OCR from Image ===
108
  @app.post("/extract-text-from-image")
109
  async def extract_text_from_image(file: UploadFile = File(...)):
110
  try:
 
113
  text = pytesseract.image_to_string(image)
114
  return {"answer": text.strip() or "⚠️ No text extracted."}
115
  except Exception as e:
116
+ return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})
117
 
118
+ # === Endpoint: Transcribe Audio ===
119
  @app.post("/transcribe-audio")
120
  async def transcribe_audio(file: UploadFile = File(...)):
121
  try:
 
126
 
127
  model = whisper.load_model("base")
128
  result = model.transcribe(audio_path)
129
+ return {"answer": result.get("text", "⚠️ No transcript returned.")}
130
+
131
  except Exception as e:
132
+ return JSONResponse(status_code=500, content={"error": f"❌ {str(e)}"})