Kalpokoch committed on
Commit
6b9a057
·
1 Parent(s): 3094824

second update to app.py along with updates to requirements and packages

Browse files
Files changed (2) hide show
  1. app.py +69 -30
  2. requirements.txt +5 -1
app.py CHANGED
@@ -1,7 +1,13 @@
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
- from fastapi.middleware.cors import CORSMiddleware # Important for frontend
3
  import uuid
4
  import logging
 
 
 
 
 
 
5
 
6
  # Configure logging
7
  logging.basicConfig(level=logging.INFO)
@@ -9,11 +15,10 @@ logger = logging.getLogger(__name__)
9
 
10
  app = FastAPI()
11
 
12
- # --- IMPORTANT: CORS Middleware ---
13
- # This allows your Netlify frontend to talk to this backend.
14
  app.add_middleware(
15
  CORSMiddleware,
16
- allow_origins=["*"], # Or specify your Netlify URL for production
17
  allow_credentials=True,
18
  allow_methods=["*"],
19
  allow_headers=["*"],
@@ -23,39 +28,73 @@ app.add_middleware(
23
  SESSION_DATA = {}
24
  logger.info("Session store initialized.")
25
 
26
@app.get("/")
def read_root():
    """Root endpoint: return a fixed status message as a JSON object."""
    status = {"message": "Universal Data AI is running!"}
    return status
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Create a new session and keep the uploaded file's raw bytes in memory.

    Returns:
        A dict with the generated ``session_id`` and the uploaded ``filename``.

    Raises:
        HTTPException: 500 if reading or storing the upload fails.
    """
    session_id = str(uuid.uuid4())
    logger.info(f"New upload received. Creating session_id: {session_id}")

    try:
        # Only the raw bytes and basic metadata are kept; no parsing happens here.
        raw_bytes = await file.read()
        record = {
            "filename": file.filename,
            "content_type": file.content_type,
            "data": raw_bytes,
        }
        SESSION_DATA[session_id] = record
        logger.info(f"File '{file.filename}' stored for session {session_id}")
        response = {"session_id": session_id, "filename": file.filename}
    except Exception as e:
        logger.error(f"Error processing file for session {session_id}: {e}")
        raise HTTPException(status_code=500, detail="Error processing file.")
    return response
52
 
53
@app.get("/session/{session_id}")
def get_session_data(session_id: str):
    """Return the stored filename for a session.

    Only metadata is exposed; the raw uploaded data is never returned.

    Raises:
        HTTPException: 404 if ``session_id`` is not in the session store.
    """
    try:
        session = SESSION_DATA[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found.") from None
    return {"filename": session.get("filename")}
 
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
  import uuid
4
  import logging
5
+ import io
6
+
7
+ # NEW: Import parsing libraries
8
+ import fitz # PyMuPDF
9
+ from PIL import Image
10
+ import pytesseract
11
 
12
  # Configure logging
13
  logging.basicConfig(level=logging.INFO)
 
15
 
16
  app = FastAPI()
17
 
18
+ # CORS Middleware
 
19
  app.add_middleware(
20
  CORSMiddleware,
21
+ allow_origins=["*"],
22
  allow_credentials=True,
23
  allow_methods=["*"],
24
  allow_headers=["*"],
 
28
  SESSION_DATA = {}
29
  logger.info("Session store initialized.")
30
 
31
+ # --- NEW: Parsing Functions ---
32
+
33
def parse_pdf(content: bytes) -> str:
    """Extract the text of every page of a PDF supplied as raw bytes.

    Args:
        content: The complete PDF file as bytes.

    Returns:
        The concatenated text of all pages, or an empty string when the
        document cannot be opened or parsed (the failure is logged).
    """
    try:
        # Using the document as a context manager closes it even when a page
        # fails to parse, so the underlying resources are not leaked.
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        logger.info(f"Successfully parsed PDF, extracted {len(text)} characters.")
        return text
    except Exception as e:
        # Best-effort by design: the caller treats "" as "nothing extracted".
        logger.error(f"PDF parsing failed: {e}")
        return ""
45
+
46
def parse_image(content: bytes) -> str:
    """Extract text from an image supplied as raw bytes, using OCR.

    Args:
        content: The complete image file as bytes.

    Returns:
        The text recognised by Tesseract, or an empty string when the image
        cannot be opened or OCR fails (the failure is logged).
    """
    try:
        # The context manager releases the image's resources once OCR is
        # done, including when OCR raises.
        with Image.open(io.BytesIO(content)) as image:
            text = pytesseract.image_to_string(image)
        logger.info(f"Successfully parsed image, extracted {len(text)} characters.")
        return text
    except Exception as e:
        # Best-effort by design: the caller treats "" as "nothing extracted".
        logger.error(f"Image parsing failed: {e}")
        return ""
56
+
57
+ # --- MODIFIED: The /upload Endpoint ---
58
 
59
@app.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """Accept a file, parse it according to its content type, and store the text.

    PDFs are parsed with ``parse_pdf``, images with ``parse_image`` (OCR), and
    ``text/plain`` uploads are decoded as UTF-8. Only the extracted text and
    the filename are kept in the session store, not the raw bytes.

    Returns:
        A dict with ``session_id``, ``filename`` and ``chars_extracted``.

    Raises:
        HTTPException: 400 if the content type is unsupported, if a text file
            is not valid UTF-8, or if no text could be extracted.
    """
    session_id = str(uuid.uuid4())
    logger.info(f"New upload '{file.filename}'. Creating session_id: {session_id}")

    content = await file.read()
    content_type = file.content_type

    # Simple dispatcher based on the upload's declared content type.
    if content_type == "application/pdf":
        extracted_text = parse_pdf(content)
    elif content_type and content_type.startswith("image/"):
        extracted_text = parse_image(content)
    elif content_type == "text/plain":
        try:
            extracted_text = content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # A badly encoded upload is a client error, not a server failure.
            raise HTTPException(
                status_code=400,
                detail="Text file is not valid UTF-8.",
            ) from exc
    else:
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file.content_type}")

    # Whitespace-only output (e.g. OCR of a blank image) counts as "no text".
    if not extracted_text.strip():
        raise HTTPException(status_code=400, detail="Could not extract any text from the file.")

    # Store the extracted text, not the raw file content.
    SESSION_DATA[session_id] = {
        "filename": file.filename,
        "text": extracted_text,
    }

    return {
        "session_id": session_id,
        "filename": file.filename,
        "chars_extracted": len(extracted_text),
    }
94
+
95
+ # This endpoint is useful for debugging
96
@app.get("/session/{session_id}/text")
def get_session_text(session_id: str):
    """Return the extracted text stored for a session (useful for debugging).

    Raises:
        HTTPException: 404 if ``session_id`` is not in the session store.
    """
    try:
        session = SESSION_DATA[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found.") from None
    return {"text": session.get("text", "")}
 
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
  fastapi
2
  uvicorn[standard]
3
- python-multipart
 
 
 
 
 
1
  fastapi
2
  uvicorn[standard]
3
+ python-multipart
4
+
5
+ PyMuPDF
6
+ Pillow
7
+ pytesseract