Ali Abdullah committed on
Commit
1843c3a
·
verified ·
1 Parent(s): 15dc739

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +41 -45
main.py CHANGED
@@ -10,34 +10,37 @@ from dotenv import load_dotenv
10
  from PIL import Image
11
  import pytesseract
12
  import whisper
13
- import traceback # for detailed error logging
14
 
15
- # Load environment variables
16
- load_dotenv()
17
-
18
- # Tesseract path
19
- pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
20
-
21
- # FFmpeg path
22
- ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
23
- os.environ["PATH"] += os.pathsep + ffmpeg_path
24
-
25
- # File reading
26
  from docx import Document
27
  import pandas as pd
28
  import PyPDF2
29
 
 
 
 
 
30
  app = FastAPI()
31
 
32
- # Groq client
33
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
 
 
34
 
 
35
  UPLOAD_DIR = "uploaded_files"
36
  os.makedirs(UPLOAD_DIR, exist_ok=True)
37
 
 
 
 
 
38
  MAX_FILE_SIZE_MB = 10
39
 
40
- # --- Helper: Extract Text from File ---
 
 
 
 
41
  def extract_text_from_file(file_path):
42
  ext = os.path.splitext(file_path)[-1].lower()
43
  try:
@@ -46,27 +49,24 @@ def extract_text_from_file(file_path):
46
  return f.read()
47
  elif ext == ".docx":
48
  doc = Document(file_path)
49
- return "\n".join([para.text for para in doc.paragraphs])
50
  elif ext == ".csv":
51
  df = pd.read_csv(file_path)
52
  return df.to_string(index=False)
53
  elif ext == ".pdf":
54
  with open(file_path, "rb") as f:
55
  reader = PyPDF2.PdfReader(f)
56
- return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
57
  else:
58
  return "❌ Unsupported file type."
59
  except Exception as e:
60
- print(" File parsing error:", str(e))
61
- traceback.print_exc()
62
- return "⚠️ Failed to extract text."
63
 
64
- # --- Endpoint: Chat with File ---
65
  @app.post("/chat-with-file")
66
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
67
  try:
68
  contents = await file.read()
69
-
70
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
71
  return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
72
 
@@ -75,61 +75,58 @@ async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)
75
  f.write(contents)
76
 
77
  file_content = extract_text_from_file(file_path)
 
78
 
79
  response = client.chat.completions.create(
80
  model="llama3-8b-8192",
81
  messages=[
82
  {"role": "system", "content": "You are a helpful assistant. Use the uploaded file content to answer questions."},
83
- {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
84
  ]
85
  )
86
-
87
- return {"answer": response.choices[0].message.content}
88
  except Exception as e:
89
- print("❌ Chat with file error:", str(e))
90
- traceback.print_exc()
91
- return JSONResponse(status_code=500, content={"error": str(e)})
92
 
93
- # --- Endpoint: Chat with URL ---
94
  class URLQuery(BaseModel):
95
  url: str
96
  question: str
97
 
 
98
  @app.post("/chat-with-url")
99
  async def chat_with_url(data: URLQuery):
100
  try:
101
- os.environ["USER_AGENT"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
102
  loader = WebBaseLoader(data.url)
103
  documents = loader.load()
104
  web_content = "\n".join([doc.page_content for doc in documents])
105
 
 
 
106
  response = client.chat.completions.create(
107
  model="llama3-8b-8192",
108
  messages=[
109
  {"role": "system", "content": "You are a helpful assistant. Use the website content to answer the user's question."},
110
- {"role": "user", "content": f"Website Content:\n{web_content}\n\nNow answer this question:\n{data.question}"}
111
  ]
112
  )
113
- return {"answer": response.choices[0].message.content}
114
  except Exception as e:
115
- print("❌ Chat with URL error:", str(e))
116
- traceback.print_exc()
117
- return JSONResponse(status_code=500, content={"error": str(e)})
118
 
119
- # --- Endpoint: OCR Image ---
120
  @app.post("/extract-text-from-image")
121
  async def extract_text_from_image(file: UploadFile = File(...)):
122
  try:
123
  contents = await file.read()
124
  image = Image.open(io.BytesIO(contents))
125
  text = pytesseract.image_to_string(image)
126
- return {"answer": text.strip() or "⚠️ No text extracted."}
127
  except Exception as e:
128
- print("❌ Image OCR error:", str(e))
129
- traceback.print_exc()
130
- return JSONResponse(status_code=500, content={"error": str(e)})
131
 
132
- # --- Endpoint: Transcribe Audio ---
133
  @app.post("/transcribe-audio")
134
  async def transcribe_audio(file: UploadFile = File(...)):
135
  try:
@@ -140,8 +137,7 @@ async def transcribe_audio(file: UploadFile = File(...)):
140
 
141
  model = whisper.load_model("base")
142
  result = model.transcribe(audio_path)
143
- return {"answer": result.get("text", "⚠️ No transcript returned.")}
 
144
  except Exception as e:
145
- print("❌ Audio transcription error:", str(e))
146
- traceback.print_exc()
147
- return JSONResponse(status_code=500, content={"error": str(e)})
 
10
  from PIL import Image
11
  import pytesseract
12
  import whisper
 
13
 
 
 
 
 
 
 
 
 
 
 
 
14
  from docx import Document
15
  import pandas as pd
16
  import PyPDF2
17
 
18
+ # Load .env
19
+ load_dotenv()
20
+
21
+ # Initialize FastAPI
22
  app = FastAPI()
23
 
24
+ # Tesseract and FFmpeg setup
25
+ pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
26
+ ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
27
+ os.environ["PATH"] += os.pathsep + ffmpeg_path
28
 
29
+ # Upload directory
30
  UPLOAD_DIR = "uploaded_files"
31
  os.makedirs(UPLOAD_DIR, exist_ok=True)
32
 
33
+ # Groq Client
34
+ client = Groq(api_key=os.getenv("GROQ_API_KEY"))
35
+
36
+ # File limit
37
  MAX_FILE_SIZE_MB = 10
38
 
39
+
40
+ def safe_response(text):
41
+ return text.encode("utf-8", "replace").decode("utf-8")
42
+
43
+
44
  def extract_text_from_file(file_path):
45
  ext = os.path.splitext(file_path)[-1].lower()
46
  try:
 
49
  return f.read()
50
  elif ext == ".docx":
51
  doc = Document(file_path)
52
+ return "\n".join([p.text for p in doc.paragraphs])
53
  elif ext == ".csv":
54
  df = pd.read_csv(file_path)
55
  return df.to_string(index=False)
56
  elif ext == ".pdf":
57
  with open(file_path, "rb") as f:
58
  reader = PyPDF2.PdfReader(f)
59
+ return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])
60
  else:
61
  return "❌ Unsupported file type."
62
  except Exception as e:
63
+ return f"⚠️ Error while reading file: {str(e)}"
64
+
 
65
 
 
66
  @app.post("/chat-with-file")
67
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
68
  try:
69
  contents = await file.read()
 
70
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
71
  return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
72
 
 
75
  f.write(contents)
76
 
77
  file_content = extract_text_from_file(file_path)
78
+ prompt = f"{file_content}\n\nQuestion: {question}"
79
 
80
  response = client.chat.completions.create(
81
  model="llama3-8b-8192",
82
  messages=[
83
  {"role": "system", "content": "You are a helpful assistant. Use the uploaded file content to answer questions."},
84
+ {"role": "user", "content": prompt}
85
  ]
86
  )
87
+ return {"answer": safe_response(response.choices[0].message.content)}
 
88
  except Exception as e:
89
+ return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
90
+
 
91
 
 
92
  class URLQuery(BaseModel):
93
  url: str
94
  question: str
95
 
96
+
97
  @app.post("/chat-with-url")
98
  async def chat_with_url(data: URLQuery):
99
  try:
100
+ os.environ["USER_AGENT"] = "Mozilla/5.0"
101
  loader = WebBaseLoader(data.url)
102
  documents = loader.load()
103
  web_content = "\n".join([doc.page_content for doc in documents])
104
 
105
+ prompt = f"Website Content:\n{web_content}\n\nNow answer this question:\n{data.question}"
106
+
107
  response = client.chat.completions.create(
108
  model="llama3-8b-8192",
109
  messages=[
110
  {"role": "system", "content": "You are a helpful assistant. Use the website content to answer the user's question."},
111
+ {"role": "user", "content": prompt}
112
  ]
113
  )
114
+ return {"answer": safe_response(response.choices[0].message.content)}
115
  except Exception as e:
116
+ return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
117
+
 
118
 
 
119
  @app.post("/extract-text-from-image")
120
  async def extract_text_from_image(file: UploadFile = File(...)):
121
  try:
122
  contents = await file.read()
123
  image = Image.open(io.BytesIO(contents))
124
  text = pytesseract.image_to_string(image)
125
+ return {"answer": safe_response(text.strip() or "⚠️ No text extracted.")}
126
  except Exception as e:
127
+ return JSONResponse(status_code=500, content={"error": safe_response(str(e))})
128
+
 
129
 
 
130
  @app.post("/transcribe-audio")
131
  async def transcribe_audio(file: UploadFile = File(...)):
132
  try:
 
137
 
138
  model = whisper.load_model("base")
139
  result = model.transcribe(audio_path)
140
+ text = result.get("text", "⚠️ No transcript returned.")
141
+ return {"answer": safe_response(text)}
142
  except Exception as e:
143
+ return JSONResponse(status_code=500, content={"error": safe_response(str(e))})