# Extract PDF content (from CHP, Hong Kong's Centre for Health Protection) with Python
import re
import sys
import urllib.request
import pdfplumber
import pandas as pd
# Abort early with a usage message when the date argument is missing.
if len(sys.argv) < 2:
    # A usage error is reported on stderr and ends with a non-zero status.
    # sys.exit is used because the bare exit() helper is only injected by the
    # site module and is not guaranteed to exist (e.g. under `python -S`).
    print("\n\nSyntax: python extract_pdf.py date_string (e.g. 20220124) \n\n", file=sys.stderr)
    sys.exit(1)
def main():
    """Extract the PDF for the date given as the first command-line argument.

    The argument (e.g. ``20220124``) is turned into the file name
    ``ctn_<date>.pdf``; for that example date the PDF is published by CHP at
    https://www.chp.gov.hk/files/pdf/ctn_20220124.pdf
    """
    requested_date = sys.argv[1]
    target_pdf = f"ctn_{requested_date}.pdf"
    # Downloading is currently switched off; the call is kept for reference:
    # download_pdf(target_pdf)
    extract_pdf(target_pdf)
# Matches any character in the CJK Unified Ideographs range U+4E00-U+9FFF.
_CJK_CHAR = re.compile('[\u4e00-\u9fff]')


def _chinese_title(cell):
    """Join the lines of *cell* that contain at least one CJK ideograph."""
    if not isinstance(cell, str):
        # A non-string cell (e.g. None) carries no text, so it has no title.
        return ""
    return "".join(line for line in cell.split("\n") if _CJK_CHAR.search(line))


def extract_pdf( pdf_file_name ):
    """Print the row id and Chinese title of every data row in a PDF's tables.

    The PDF is read from ``./pdf/<pdf_file_name>``. For every table on every
    page the first row is treated as the header and skipped. For each
    remaining row whose first cell is a non-empty string, one line is
    printed: the first cell with all ``.`` characters removed, followed by
    the lines of the second cell that contain at least one character in
    U+4E00-U+9FFF, concatenated without a separator.

    Args:
        pdf_file_name: File name (not a path) of a PDF inside ``./pdf``.
    """
    with pdfplumber.open(f"./pdf/{pdf_file_name}") as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # Rows are walked directly instead of through a DataFrame:
                # this avoids positional indexing on a label-indexed Series
                # and copes with empty or ragged tables.
                for row in table[1:]:
                    if len(row) < 2:
                        # No title column to read from.
                        continue
                    first_cell = row[0]
                    if not (isinstance(first_cell, str) and first_cell):
                        # Rows without a textual id are skipped.
                        continue
                    rowid = first_cell.replace(".", "")
                    print(rowid, _chinese_title(row[1]))
def download_pdf ( pdf_file_name ) :
    """Download a PDF from the CHP site into the local ``./pdf`` directory.

    Args:
        pdf_file_name: File name only (e.g. ``ctn_20220124.pdf``). It is
            appended to ``https://www.chp.gov.hk/files/pdf/`` to form the
            URL and to ``./pdf/`` to form the destination path.

    Raises:
        OSError: Propagated from ``urlopen`` (``urllib.error.URLError`` is a
            subclass) or from opening/writing the destination file.
    """
    url = f"https://www.chp.gov.hk/files/pdf/{pdf_file_name}"
    # Read the whole body before touching the disk so a failed download does
    # not leave an empty file behind; the response is closed either way.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    # The context manager closes the file even if the write fails.
    with open(f"./pdf/{pdf_file_name}", "wb") as out_file:
        out_file.write(payload)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Comments
Post a Comment