# Using R with Amazon Web Services for document analysis# https://aws.amazon.com/blogs/opensource/using-r-with-amazon-web-services-for-document-analysis/# This R program shows how you can use AWS with R to create a data pipeline for# extracting data from PDFs for future processing, using Textract, S3,# Relational Database Service (RDS).# The PDF document is from the Greenbook projections, a set of economic # projections made by the Federal Reserve from 1966 to the present. They are# available as a collection of PDFs from the Philadelphia Federal Reserve at:# https://www.philadelphiafed.org/research-and-data/real-time-center/greenbook-data/pdf-data-set# This R program expects that you have:# - An S3 bucket with PDFs# - An RDS PostgreSQL database with IAM authentication# - A user, instance, or container with IAM permissions to access Textract# and your RDS database# - R, and the packages paws, DBI, and RPostgres#-------------------------------------------------------------------------------# The location of the first Greenbook Projections PDF document in S3.bucket<-"my-bucket"file<-"GS-1966-01-11.pdf"textract<-paws::textract()#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------# Get a PDF document's tables using Amazon Textract.# --------------------------------------------------analyze_document<-function(bucket,file){# Start analyzing the PDF.resp<-textract$start_document_analysis(DocumentLocation=list(S3Object=list(Bucket=bucket,Name=file)),FeatureTypes="TABLES")# Check that the analysis is done and get the result.count<-0while(count<30&&(!exists("result")||result$JobStatus=="IN_PROGRESS")){Sys.sleep(1)result<-textract$get_document_analysis(JobId=resp$JobId)# If the result has multiple parts, get the remaining parts.next_token<-result$NextTokenwhile(length(next_token)>0){next_result<-textract$get_document_analysis(JobId=resp$JobId,NextToken=next_token)result$Blocks<-c(result$Blocks,next_result$Blocks)next_token<-next_result$NextToken}count<-count+1}return(result)}analysis<-analyze_document(bucket,file)#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------# Turn the Textract analysis result into matrices.# ------------------------------------------------# Get all children for a given block.get_children<-function(block,data){if(length(block$Relationships)==0){return(list())}idx<-which(sapply(block$Relationships,function(x)x$Type)=="CHILD")if(!idx){return(list())}child_ids<-block$Relationships[[idx]]$Idsresult<-data[child_ids]return(result)}# Get all tables for a given document analysis returned by `analyze_document`.get_tables<-function(analysis){blocks<-analysis$Blocksnames(blocks)<-sapply(blocks,function(x)x$Id)tables<-list()for(blockinblocks){if(block$BlockType=="TABLE"){cells<-get_children(block,blocks)rows<-max(sapply(cells,function(x)x$RowIndex))cols<-max(sapply(cells,function(x)x$ColumnIndex))table<-matrix(nrow=rows,ncol=cols)# 1. Go through a table's cells one-by-onefor(cellincells){# 2. Get the cell's contentswords<-get_children(cell,blocks)text<-paste(sapply(words,function(x)x$Text),collapse=" ")# 3. Insert the cell contents into the matrixrow<-cell$RowIndexcol<-cell$ColumnIndextable[row,col]<-text}tables<-c(tables,list(table))}}return(tables)}tables<-get_tables(analysis)#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------#-------------------------------------------------------------------------------# Upload our results to our database.# -----------------------------------# Connect to the database using an IAM authentication token.rds<-paws::rds()token<-rds$build_auth_token("myhost:5432","us-east-1","david")con<-DBI::dbConnect(RPostgres::Postgres(),host="myhost",port=5432,dbname="mydb",user="david",password=token)# Create rows for each table to insert into the database.database_rows<-data.frame(document="GS-1966-01-11.pdf",table_num=1:length(tables),data=sapply(tables,jsonlite::toJSON))# Store the tables in the database.DBI::dbAppendTable(con,name="tables",value=database_rows)