Ali Abdullah committed on
Commit
735359c
·
verified ·
1 Parent(s): 1843c3a

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +34 -41
main.py CHANGED
@@ -15,32 +15,31 @@ from docx import Document
15
  import pandas as pd
16
  import PyPDF2
17
 
18
- # Load .env
19
  load_dotenv()
20
 
 
 
 
 
21
  # Initialize FastAPI
22
  app = FastAPI()
23
 
24
- # Tesseract and FFmpeg setup
25
- pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
26
- ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
27
- os.environ["PATH"] += os.pathsep + ffmpeg_path
28
 
29
- # Upload directory
30
  UPLOAD_DIR = "uploaded_files"
31
  os.makedirs(UPLOAD_DIR, exist_ok=True)
32
-
33
- # Groq Client
34
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
35
-
36
- # File limit
37
  MAX_FILE_SIZE_MB = 10
38
 
39
-
40
  def safe_response(text):
41
- return text.encode("utf-8", "replace").decode("utf-8")
42
-
 
 
43
 
 
44
  def extract_text_from_file(file_path):
45
  ext = os.path.splitext(file_path)[-1].lower()
46
  try:
@@ -48,11 +47,9 @@ def extract_text_from_file(file_path):
48
  with open(file_path, "r", encoding="utf-8") as f:
49
  return f.read()
50
  elif ext == ".docx":
51
- doc = Document(file_path)
52
- return "\n".join([p.text for p in doc.paragraphs])
53
  elif ext == ".csv":
54
- df = pd.read_csv(file_path)
55
- return df.to_string(index=False)
56
  elif ext == ".pdf":
57
  with open(file_path, "rb") as f:
58
  reader = PyPDF2.PdfReader(f)
@@ -60,40 +57,39 @@ def extract_text_from_file(file_path):
60
  else:
61
  return "❌ Unsupported file type."
62
  except Exception as e:
63
- return f"⚠️ Error while reading file: {str(e)}"
64
-
65
 
 
66
  @app.post("/chat-with-file")
67
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
68
  try:
69
  contents = await file.read()
70
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
71
- return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
72
 
73
  file_path = os.path.join(UPLOAD_DIR, file.filename)
74
  with open(file_path, "wb") as f:
75
  f.write(contents)
76
 
77
  file_content = extract_text_from_file(file_path)
78
- prompt = f"{file_content}\n\nQuestion: {question}"
79
 
80
  response = client.chat.completions.create(
81
  model="llama3-8b-8192",
82
  messages=[
83
- {"role": "system", "content": "You are a helpful assistant. Use the uploaded file content to answer questions."},
84
- {"role": "user", "content": prompt}
85
  ]
86
  )
87
- return {"answer": safe_response(response.choices[0].message.content)}
88
- except Exception as e:
89
- return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
90
 
 
 
 
91
 
 
92
  class URLQuery(BaseModel):
93
  url: str
94
  question: str
95
 
96
-
97
  @app.post("/chat-with-url")
98
  async def chat_with_url(data: URLQuery):
99
  try:
@@ -102,31 +98,29 @@ async def chat_with_url(data: URLQuery):
102
  documents = loader.load()
103
  web_content = "\n".join([doc.page_content for doc in documents])
104
 
105
- prompt = f"Website Content:\n{web_content}\n\nNow answer this question:\n{data.question}"
106
-
107
  response = client.chat.completions.create(
108
  model="llama3-8b-8192",
109
  messages=[
110
- {"role": "system", "content": "You are a helpful assistant. Use the website content to answer the user's question."},
111
- {"role": "user", "content": prompt}
112
  ]
113
  )
114
- return {"answer": safe_response(response.choices[0].message.content)}
115
  except Exception as e:
116
- return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
117
-
118
 
 
119
  @app.post("/extract-text-from-image")
120
  async def extract_text_from_image(file: UploadFile = File(...)):
121
  try:
122
  contents = await file.read()
123
  image = Image.open(io.BytesIO(contents))
124
  text = pytesseract.image_to_string(image)
125
- return {"answer": safe_response(text.strip() or "⚠️ No text extracted.")}
126
  except Exception as e:
127
- return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
128
-
129
 
 
130
  @app.post("/transcribe-audio")
131
  async def transcribe_audio(file: UploadFile = File(...)):
132
  try:
@@ -137,7 +131,6 @@ async def transcribe_audio(file: UploadFile = File(...)):
137
 
138
  model = whisper.load_model("base")
139
  result = model.transcribe(audio_path)
140
- text = result.get("text", "⚠️ No transcript returned.")
141
- return {"answer": safe_response(text)}
142
  except Exception as e:
143
- return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
 
15
  import pandas as pd
16
  import PyPDF2
17
 
18
+ # Load environment variables
19
  load_dotenv()
20
 
21
+ # Paths for OCR and FFmpeg
22
+ pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
23
+ os.environ["PATH"] += os.pathsep + os.getenv("FFMPEG_PATH", "/usr/bin")
24
+
25
  # Initialize FastAPI
26
  app = FastAPI()
27
 
28
+ # Load Groq client with key from environment
29
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 
30
 
 
31
  UPLOAD_DIR = "uploaded_files"
32
  os.makedirs(UPLOAD_DIR, exist_ok=True)
 
 
 
 
 
33
  MAX_FILE_SIZE_MB = 10
34
 
35
+ # --- Helper: Safe Response ---
36
  def safe_response(text):
37
+ try:
38
+ return str(text).encode("utf-8", "replace").decode("utf-8")
39
+ except Exception:
40
+ return "⚠️ Unable to process response."
41
 
42
+ # --- Extract text from uploaded file ---
43
  def extract_text_from_file(file_path):
44
  ext = os.path.splitext(file_path)[-1].lower()
45
  try:
 
47
  with open(file_path, "r", encoding="utf-8") as f:
48
  return f.read()
49
  elif ext == ".docx":
50
+ return "\n".join([p.text for p in Document(file_path).paragraphs])
 
51
  elif ext == ".csv":
52
+ return pd.read_csv(file_path).to_string(index=False)
 
53
  elif ext == ".pdf":
54
  with open(file_path, "rb") as f:
55
  reader = PyPDF2.PdfReader(f)
 
57
  else:
58
  return "❌ Unsupported file type."
59
  except Exception as e:
60
+ return f" Failed to extract file: {e}"
 
61
 
62
+ # --- Endpoint: Chat with File ---
63
  @app.post("/chat-with-file")
64
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
65
  try:
66
  contents = await file.read()
67
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
68
+ return JSONResponse(content={"answer": "❌ File too large. Max 10MB."})
69
 
70
  file_path = os.path.join(UPLOAD_DIR, file.filename)
71
  with open(file_path, "wb") as f:
72
  f.write(contents)
73
 
74
  file_content = extract_text_from_file(file_path)
 
75
 
76
  response = client.chat.completions.create(
77
  model="llama3-8b-8192",
78
  messages=[
79
+ {"role": "system", "content": "You are a helpful assistant. Use the file content to answer questions."},
80
+ {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
81
  ]
82
  )
 
 
 
83
 
84
+ return JSONResponse(content={"answer": safe_response(response.choices[0].message.content)})
85
+ except Exception as e:
86
+ return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
87
 
88
+ # --- Endpoint: Chat with URL ---
89
  class URLQuery(BaseModel):
90
  url: str
91
  question: str
92
 
 
93
  @app.post("/chat-with-url")
94
  async def chat_with_url(data: URLQuery):
95
  try:
 
98
  documents = loader.load()
99
  web_content = "\n".join([doc.page_content for doc in documents])
100
 
 
 
101
  response = client.chat.completions.create(
102
  model="llama3-8b-8192",
103
  messages=[
104
+ {"role": "system", "content": "You are a helpful assistant. Use the website content to answer questions."},
105
+ {"role": "user", "content": f"Website Content:\n{web_content}\n\nNow answer this:\n{data.question}"}
106
  ]
107
  )
108
+ return JSONResponse(content={"answer": safe_response(response.choices[0].message.content)})
109
  except Exception as e:
110
+ return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
 
111
 
112
+ # --- Endpoint: OCR from Image ---
113
  @app.post("/extract-text-from-image")
114
  async def extract_text_from_image(file: UploadFile = File(...)):
115
  try:
116
  contents = await file.read()
117
  image = Image.open(io.BytesIO(contents))
118
  text = pytesseract.image_to_string(image)
119
+ return JSONResponse(content={"answer": safe_response(text.strip() or "⚠️ No text extracted.")})
120
  except Exception as e:
121
+ return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
 
122
 
123
+ # --- Endpoint: Transcribe Audio ---
124
  @app.post("/transcribe-audio")
125
  async def transcribe_audio(file: UploadFile = File(...)):
126
  try:
 
131
 
132
  model = whisper.load_model("base")
133
  result = model.transcribe(audio_path)
134
+ return JSONResponse(content={"answer": safe_response(result.get("text", "⚠️ No transcript returned."))})
 
135
  except Exception as e:
136
+ return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})