jmparejaz committed on
Commit
7e04d95
·
verified ·
1 Parent(s): 2b593e9

fix: use huggingface/spaces base image with Tesseract

Browse files
Files changed (4) hide show
  1. Dockerfile +19 -0
  2. README.md +1 -3
  3. check_space.py +21 -0
  4. delete_pdfs.py +35 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Base image for the Space.
# NOTE(review): ":latest" is an unpinned tag, so a rebuild may pick up a
# different base image — consider pinning a specific tag or digest.
FROM huggingface/spaces:latest

# Install Tesseract OCR and the "spa" language data package.
# --no-install-recommends keeps optional recommended packages out of the
# image; the apt lists are removed in the same layer so they are not stored.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-spa \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying the app so this layer is only
# rebuilt when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy app
COPY app.py .

# Gradio server settings: bind to all interfaces on port 7860.
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -3,9 +3,7 @@ title: LayoutLMv3 Document Classifier
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.0.0
8
- app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  short_description: Clasificador de documentos legales con LayoutLMv3
 
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
6
+ sdk: docker
 
 
7
  pinned: false
8
  license: apache-2.0
9
  short_description: Clasificador de documentos legales con LayoutLMv3
check_space.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Open the Space page in headless Chromium and print its title and body text.

Any error raised while loading or reading the page is printed rather than
re-raised, so the script reports the failure and still shuts the browser down.
"""
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_page()

        try:
            page.goto(
                "https://huggingface.co/spaces/jmparejaz/documents_classifyer",
                timeout=60000,
                wait_until="domcontentloaded",
            )
            # Fixed 3 s pause after DOMContentLoaded before reading the page.
            page.wait_for_timeout(3000)

            print("Title:", page.title())

            # Get page content; only the first 1500 characters are printed.
            body_text = page.locator("body").inner_text()
            print("\n--- Page Content ---")
            print(body_text[:1500])

        except Exception as e:
            print(f"Error: {e}")
    finally:
        # Close the browser on every exit path, including a failure in
        # new_page() or an interrupt that is not an Exception subclass.
        browser.close()
delete_pdfs.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Delete a fixed list of PDF files from the Space repository, one at a time.

Each deletion is attempted independently: a failure is printed and the script
moves on to the next file.
"""
from huggingface_hub import HfApi

api = HfApi()
repo_id = "jmparejaz/documents_classifyer"

# Repository paths of the PDFs to remove.
files_to_delete = [
    "EP - Grado de Serano, Maria - S&I.pdf",
    "EP - Hernandez Portillo, Manuel - S&I.pdf",
    "EP - Ibarra, Guadalupe - S&I.pdf",
    "EP - Mendoza Reyes, Faustino - S&I.pdf",
    "MC - Garza, Anna - S&I Notes.pdf",
    "MC - Garza, Anna - S&I SOCP.pdf",
    "MC - Garza, Anna - S&I.pdf",
    "MC - Junez, Juan Jr - S&I.pdf",
    "MC - Marron, Maria - S&I Notes.pdf",
    "MC - Marron, Maria - S&I SOCP.pdf",
    "MC - Marron, Maria - S&I.pdf",
    "SA - Banda, Nicolas - S&I.pdf",
    "SA - Benavidez, Ysidro - S&I Demos.pdf",
    "SA - Benavidez, Ysidro - S&I Notes.pdf",
    "SA - Benavidez, Ysidro - S&I.pdf",
    "SA - Fernandez, Hector - S&I.pdf",
    "SA - Heath, Nelda - S&I Notes.pdf",
    "SA - Heath, Nelda - S&I.pdf",
    "SA - Valdez, Rogelio - S&I Demos.pdf",
    "SA - Valdez, Rogelio - S&I Notes.pdf",
    "SA - Valdez, Rogelio - S&I.pdf",
]


def _remove_from_space(path_in_repo):
    """Delete one file from the Space repo and print the outcome."""
    try:
        api.delete_file(
            path_in_repo=path_in_repo,
            repo_id=repo_id,
            repo_type="space",
        )
        print(f"Deleted: {path_in_repo}")
    except Exception as err:
        # Best-effort: report the failure and let the caller continue.
        print(f"Error deleting {path_in_repo}: {err}")


for pdf_name in files_to_delete:
    _remove_from_space(pdf_name)