Monday, January 24, 2022

Extract PDF content (from CHP) with Python

import re

import sys

import urllib.request

import pdfplumber

import pandas as pd



# Fail fast with a usage hint when the date argument is missing.
if len(sys.argv) < 2:
    print("\n\nSyntax: python extract_pdf.py date_string (e.g. 20220124) \n\n")
    # sys.exit is always available, whereas the bare exit() helper is injected
    # by the `site` module and is absent under `python -S` or in frozen builds.
    # The non-zero status lets a calling shell detect the usage error.
    sys.exit(1)


def main():
    """Extract the CHP PDF that matches the date given on the command line.

    The date string (e.g. ``20220124``) is read from ``sys.argv[1]`` and
    turned into the file name ``ctn_<date>.pdf``; the matching document is
    published at https://www.chp.gov.hk/files/pdf/ctn_<date>.pdf.
    """
    date_string = sys.argv[1]
    # The download step is currently switched off; call download_pdf() with
    # the same file name first when the PDF is not available locally yet.
    extract_pdf(f"ctn_{date_string}.pdf")



def extract_pdf(pdf_file_name):
    """Print the row id and CJK title text of every table row in a PDF.

    Opens ``./pdf/<pdf_file_name>`` with pdfplumber and walks every table on
    every page. The first row of each table is treated as a header and
    skipped. For each remaining row whose first cell is a non-empty string,
    prints that cell with the dots removed, followed by those lines of the
    second cell that contain at least one CJK Unified Ideograph
    (U+4E00-U+9FFF), joined together without a separator.

    Args:
        pdf_file_name: File name of the PDF inside the ``./pdf/`` directory.
    """
    # Compiled once rather than looked up again for every line of every cell.
    cjk_pattern = re.compile(u'[\u4e00-\u9fff]')

    with pdfplumber.open(f"./pdf/{pdf_file_name}") as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # The rows are iterated directly instead of going through a
                # DataFrame: row[0]/row[1] on a string-labelled Series depend
                # on pandas' deprecated positional fallback for integer keys,
                # and the DataFrame constructor rejects ragged rows.
                for row in table[1:]:
                    if not row:
                        continue
                    first_cell = row[0]
                    if not isinstance(first_cell, str) or not first_cell:
                        continue
                    rowid = first_cell.replace(".", "")

                    # Guard against a missing or non-string second cell so a
                    # single odd row does not abort the whole extraction.
                    second_cell = row[1] if len(row) > 1 else None
                    title_text = second_cell if isinstance(second_cell, str) else ""

                    kept_lines = [
                        line
                        for line in title_text.split("\n")
                        if cjk_pattern.search(line) is not None
                    ]
                    print(rowid, "".join(kept_lines))


def download_pdf(pdf_file_name):
    """Download a PDF from the CHP site into the local ``./pdf/`` directory.

    Fetches ``https://www.chp.gov.hk/files/pdf/<pdf_file_name>`` and writes
    the response body to ``./pdf/<pdf_file_name>``, overwriting any existing
    file of that name.

    Args:
        pdf_file_name: File name of the PDF, e.g. ``ctn_20220124.pdf``.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(f"https://www.chp.gov.hk/files/pdf/{pdf_file_name}") as response:
        payload = response.read()

    # The destination is opened only after the download succeeded, so a
    # failed request never truncates a previously saved copy; the `with`
    # block closes the file even if the write raises.
    with open(f"./pdf/{pdf_file_name}", "wb") as out_file:
        out_file.write(payload)



# Run the extraction only when the file is executed as a script, so that
# importing it for its functions does not start processing a PDF.
if __name__ == "__main__":
    main()

No comments:

Post a Comment

CSP on Apache

To add CSP to root is sort of funny. The following will NOT work for most cases!!     <LocationMatch "^/$">        Header s...