Extract PDF content (from CHP) with Python
"""Extract table content from a CHP PDF file (ctn_<date_string>.pdf)."""

import re
import sys
import urllib.request

import pandas as pd
import pdfplumber

# Usage guard: the script needs a date string as its first argument.
# sys.exit() is used instead of the interactive-only exit() helper (which
# the site module may not install), and a non-zero status signals the
# usage error to the calling shell.
if len(sys.argv) < 2:
    print("\n\nSyntax: python extract_pdf.py date_string (e.g. 20220124) \n\n")
    sys.exit(1)


def main():
    """Build the PDF file name from the command-line date and extract it.

    Reads the date string from ``sys.argv[1]`` (e.g. ``20220124``), derives
    the file name ``ctn_<date_string>.pdf`` and passes it to
    :func:`extract_pdf`.
    """
    date_string = sys.argv[1]  # e.g. 20220124
    # PDF from CHP: https://www.chp.gov.hk/files/pdf/ctn_20220124.pdf
    pdf_file_name = f"ctn_{date_string}.pdf"

    # The download step is disabled, so the PDF must already be present
    # under ./pdf/ (that is where extract_pdf opens it from).
    # download_pdf(pdf_file_name)
    extract_pdf(pdf_file_name)


def extract_pdf(pdf_file_name):
    """Walk every table on every page of ``./pdf/<pdf_file_name>``.

    Each table returned by ``page.extract_tables()`` is loaded into a
    DataFrame, using the table's first row as the column headers and the
    remaining rows as data, and then iterated row by row.

    Args:
        pdf_file_name: File name (not a path) of a PDF inside ``./pdf/``.
    """
    with pdfplumber.open(f"./pdf/{pdf_file_name}") as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # First row is the header, the rest are data rows.
                df = pd.DataFrame(table[1:], columns=table[0])
                for index, row in df.iterrows():
                    # NOTE(review): the source is truncated at this point;
                    # the per-row processing (and anything that followed in
                    # the script, such as the call to main()) is not visible
                    # and must be restored from the full file.
                    ...