Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """Untitled60.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1if8bXKbgk5lh_oSOZ6Vf6dZ8m7MZ1B-p | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| from PyPDF2 import PdfReader | |
| import re | |
| import os | |
def process_pdf(file):
    """Extract usernames from a Threads followers PDF into an Excel file.

    Pages are scanned in order. Nothing is collected until a line containing
    "Accounts that follow you in Threads" is seen; after that marker every
    non-blank line is kept as a username unless it contains 'AM' or 'PM'
    (presumably timestamp lines) or matches the file-path pattern (a '/'
    followed later by a '.').

    Args:
        file: The uploaded PDF. Either an object whose ``name`` attribute is
            the path on disk, or the path (or a binary stream) itself.

    Returns:
        The path of the workbook that was written, "Followers_output.xlsx",
        containing a single "Username" column.

    Raises:
        ValueError: If ``file`` is None (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No PDF file was provided.")

    # Upload wrappers carry the on-disk path in ``.name``; plain paths and
    # streams are handed to PdfReader unchanged.
    pdf_source = getattr(file, "name", file)
    pdf = PdfReader(pdf_source)

    data = []
    # File path regex pattern
    file_path_pattern = re.compile(r'.*\/.*\..*$')
    # Collection starts only after the marker line and then stays on for the
    # remaining pages, so the flag lives outside the page loop.
    start_collecting = False

    for page in pdf.pages:
        # Guard against pages that yield no text at all.
        text = page.extract_text() or ""
        for line in text.split('\n'):
            if "Accounts that follow you in Threads" in line:
                start_collecting = True
                continue
            if not start_collecting or not line.strip():
                continue
            if 'AM' in line or 'PM' in line:
                continue
            if file_path_pattern.match(line):
                continue
            data.append(line)

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Username'])
    # Save DataFrame to Excel
    output_file = "Followers_output.xlsx"
    df.to_excel(output_file, index=False)
    return output_file
# Define the Gradio interface.
# The output uses the top-level gr.File component: the legacy `gr.outputs`
# namespace was deprecated in Gradio 3.x and is gone in 4.x, where
# `gr.outputs.File` fails at import time.
iface = gr.Interface(
    fn=process_pdf,
    inputs="file",
    outputs=gr.File(label="Output Excel"),
    title="PDF threads data to Excel",
    description="Hi Humood! Just Upload the PDF file of threads data and get an Excel file with usernames.",
)
iface.launch()