Ali Abdullah committed on
Commit
b6186d4
·
verified ·
1 Parent(s): 0777ffb

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +30 -46
main.py CHANGED
@@ -15,57 +15,45 @@ from docx import Document
15
  import pandas as pd
16
  import PyPDF2
17
 
18
- # Load environment variables
19
  load_dotenv()
20
 
21
- # Paths for OCR and FFmpeg
22
  pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
23
- os.environ["PATH"] += os.pathsep + os.getenv("FFMPEG_PATH", "/usr/bin")
 
24
 
25
- # Initialize FastAPI
26
  app = FastAPI()
27
 
28
- # Load Groq client with key from environment
29
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
30
 
31
  UPLOAD_DIR = "uploaded_files"
32
  os.makedirs(UPLOAD_DIR, exist_ok=True)
33
- MAX_FILE_SIZE_MB = 10
34
 
35
- # --- Helper: Safe Response ---
36
- def safe_response(text):
37
- try:
38
- return str(text).encode("utf-8", "replace").decode("utf-8")
39
- except Exception:
40
- return "⚠️ Unable to process response."
41
 
42
- # --- Extract text from uploaded file ---
43
  def extract_text_from_file(file_path):
44
  ext = os.path.splitext(file_path)[-1].lower()
45
- try:
46
- if ext == ".txt":
47
- with open(file_path, "r", encoding="utf-8") as f:
48
- return f.read()
49
- elif ext == ".docx":
50
- return "\n".join([p.text for p in Document(file_path).paragraphs])
51
- elif ext == ".csv":
52
- return pd.read_csv(file_path).to_string(index=False)
53
- elif ext == ".pdf":
54
- with open(file_path, "rb") as f:
55
- reader = PyPDF2.PdfReader(f)
56
- return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])
57
- else:
58
- return "❌ Unsupported file type."
59
- except Exception as e:
60
- return f"❌ Failed to extract file: {e}"
61
 
62
- # --- Endpoint: Chat with File ---
63
  @app.post("/chat-with-file")
64
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
65
  try:
66
  contents = await file.read()
67
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
68
- return JSONResponse(content={"answer": "❌ File too large. Max 10MB."})
69
 
70
  file_path = os.path.join(UPLOAD_DIR, file.filename)
71
  with open(file_path, "wb") as f:
@@ -76,16 +64,14 @@ async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)
76
  response = client.chat.completions.create(
77
  model="llama3-8b-8192",
78
  messages=[
79
- {"role": "system", "content": "You are a helpful assistant. Use the file content to answer questions."},
80
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
81
  ]
82
  )
83
-
84
- return JSONResponse(content={"answer": safe_response(response.choices[0].message.content)})
85
  except Exception as e:
86
- return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
87
 
88
- # --- Endpoint: Chat with URL ---
89
  class URLQuery(BaseModel):
90
  url: str
91
  question: str
@@ -101,26 +87,24 @@ async def chat_with_url(data: URLQuery):
101
  response = client.chat.completions.create(
102
  model="llama3-8b-8192",
103
  messages=[
104
- {"role": "system", "content": "You are a helpful assistant. Use the website content to answer questions."},
105
- {"role": "user", "content": f"Website Content:\n{web_content}\n\nNow answer this:\n{data.question}"}
106
  ]
107
  )
108
- return JSONResponse(content={"answer": safe_response(response.choices[0].message.content)})
109
  except Exception as e:
110
- return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
111
 
112
- # --- Endpoint: OCR from Image ---
113
  @app.post("/extract-text-from-image")
114
  async def extract_text_from_image(file: UploadFile = File(...)):
115
  try:
116
  contents = await file.read()
117
  image = Image.open(io.BytesIO(contents))
118
  text = pytesseract.image_to_string(image)
119
- return JSONResponse(content={"answer": safe_response(text.strip() or "⚠️ No text extracted.")})
120
  except Exception as e:
121
- return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
122
 
123
- # --- Endpoint: Transcribe Audio ---
124
  @app.post("/transcribe-audio")
125
  async def transcribe_audio(file: UploadFile = File(...)):
126
  try:
@@ -131,6 +115,6 @@ async def transcribe_audio(file: UploadFile = File(...)):
131
 
132
  model = whisper.load_model("base")
133
  result = model.transcribe(audio_path)
134
- return JSONResponse(content={"answer": safe_response(result.get("text", "⚠️ No transcript returned."))})
135
  except Exception as e:
136
- return JSONResponse(status_code=500, content={"answer": f"❌ Error: {e}"})
 
15
  import pandas as pd
16
  import PyPDF2
17
 
 
18
  load_dotenv()
19
 
 
20
  pytesseract.pytesseract.tesseract_cmd = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
21
+ ffmpeg_path = os.getenv("FFMPEG_PATH", "/usr/bin")
22
+ os.environ["PATH"] += os.pathsep + ffmpeg_path
23
 
 
24
  app = FastAPI()
25
 
 
26
  client = Groq(api_key=os.getenv("GROQ_API_KEY"))
27
 
28
  UPLOAD_DIR = "uploaded_files"
29
  os.makedirs(UPLOAD_DIR, exist_ok=True)
 
30
 
31
+ MAX_FILE_SIZE_MB = 10
 
 
 
 
 
32
 
 
33
  def extract_text_from_file(file_path):
34
  ext = os.path.splitext(file_path)[-1].lower()
35
+ if ext == ".txt":
36
+ with open(file_path, "r", encoding="utf-8") as f:
37
+ return f.read()
38
+ elif ext == ".docx":
39
+ doc = Document(file_path)
40
+ return "\n".join([para.text for para in doc.paragraphs])
41
+ elif ext == ".csv":
42
+ df = pd.read_csv(file_path)
43
+ return df.to_string(index=False)
44
+ elif ext == ".pdf":
45
+ with open(file_path, "rb") as f:
46
+ reader = PyPDF2.PdfReader(f)
47
+ return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
48
+ else:
49
+ return "❌ Unsupported file type."
 
50
 
 
51
  @app.post("/chat-with-file")
52
  async def chat_with_file(file: UploadFile = File(...), question: str = Form(...)):
53
  try:
54
  contents = await file.read()
55
  if len(contents) > MAX_FILE_SIZE_MB * 1024 * 1024:
56
+ return JSONResponse(status_code=400, content={"error": "❌ File too large. Max size is 10MB."})
57
 
58
  file_path = os.path.join(UPLOAD_DIR, file.filename)
59
  with open(file_path, "wb") as f:
 
64
  response = client.chat.completions.create(
65
  model="llama3-8b-8192",
66
  messages=[
67
+ {"role": "system", "content": "You are a helpful assistant. Use the uploaded file content to answer questions."},
68
  {"role": "user", "content": f"{file_content}\n\nQuestion: {question}"}
69
  ]
70
  )
71
+ return {"answer": response.choices[0].message.content}
 
72
  except Exception as e:
73
+ return JSONResponse(status_code=500, content={"error": str(e)})
74
 
 
75
  class URLQuery(BaseModel):
76
  url: str
77
  question: str
 
87
  response = client.chat.completions.create(
88
  model="llama3-8b-8192",
89
  messages=[
90
+ {"role": "system", "content": "You are a helpful assistant. Use the website content to answer the user's question."},
91
+ {"role": "user", "content": f"Website Content:\n{web_content}\n\nNow answer this question:\n{data.question}"}
92
  ]
93
  )
94
+ return {"answer": response.choices[0].message.content}
95
  except Exception as e:
96
+ return JSONResponse(status_code=500, content={"error": str(e)})
97
 
 
98
  @app.post("/extract-text-from-image")
99
  async def extract_text_from_image(file: UploadFile = File(...)):
100
  try:
101
  contents = await file.read()
102
  image = Image.open(io.BytesIO(contents))
103
  text = pytesseract.image_to_string(image)
104
+ return {"answer": text.strip() or "⚠️ No text extracted."}
105
  except Exception as e:
106
+ return JSONResponse(status_code=500, content={"error": str(e)})
107
 
 
108
  @app.post("/transcribe-audio")
109
  async def transcribe_audio(file: UploadFile = File(...)):
110
  try:
 
115
 
116
  model = whisper.load_model("base")
117
  result = model.transcribe(audio_path)
118
+ return {"answer": result["text"] if result.get("text") else "⚠️ No transcript returned."}
119
  except Exception as e:
120
+ return JSONResponse(status_code=500, content={"error": str(e)})