package com.bjzhanghao;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

import java.io.IOException;
import java.util.List;

/**
 * Sample program that reads data out of a Parquet file as
 * {@link org.apache.parquet.example.data.Group} records.
 */
public class ReadParquetDemo {

    /** Hadoop configuration shared by the read methods of this instance. */
    Configuration conf;

    /**
     * Creates the demo with a fresh Hadoop configuration in which the
     * {@code hdfs} and {@code file} scheme implementations are set explicitly.
     */
    public ReadParquetDemo() {
        conf = new Configuration();
        conf.set("fs.hdfs.impl",
                org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
        );
        conf.set("fs.file.impl",
                org.apache.hadoop.fs.LocalFileSystem.class.getName()
        );
    }

    /**
     * Reads the schema of a Parquet file from its footer.
     *
     * @param parquetPath path of the Parquet file
     * @return the top-level fields of the file's schema
     * @throws IOException if the footer cannot be read
     */
    public List<Type> readSchemaFromParquet(Path parquetPath) throws IOException {
        ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, parquetPath, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        return schema.getFields();
    }

    /**
     * Counter-example: this is the WRONG way to read a Parquet file, because no
     * read schema is configured and therefore every column is actually scanned.
     *
     * @param parquetPath path of the Parquet file
     * @param queryFields names of the columns to access; each is read as an integer
     * @throws IOException if the file cannot be opened, read or closed
     */
    public void readParquetNoReadSchema(Path parquetPath, String[] queryFields) throws IOException {
        GroupReadSupport readSupport = new GroupReadSupport();
        ParquetReader.Builder<Group> readerBuilder = ParquetReader.builder(readSupport, parquetPath);
        // try-with-resources: the reader must be closed even when read() throws.
        try (ParquetReader<Group> reader = readerBuilder.build()) {
            Group line;
            while ((line = reader.read()) != null) {
                for (String field : queryFields) {
                    // The value is read and discarded; the demo only exercises the read path.
                    line.getInteger(field, 0);
                }
            }
        }
    }

    /**
     * To benefit from Parquet's columnar storage, configure the
     * {@code PARQUET_READ_SCHEMA} parameter so that the columns of the file that
     * are not needed are skipped.
     *
     * <p>Every requested column is declared as a {@code REQUIRED INT32} field in
     * the projection schema.
     *
     * @param parquetPath path of the Parquet file
     * @param queryFields names of the columns to read
     * @throws IOException if the file cannot be opened, read or closed
     */
    public void readParquetWithReadSchema(Path parquetPath, String[] queryFields) throws IOException {
        // Put the columns to read into PARQUET_READ_SCHEMA; without this step
        // read performance degrades severely.
        Types.MessageTypeBuilder builder = Types.buildMessage();
        for (String field : queryFields) {
            builder.addField(new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.INT32, field));
        }
        MessageType messageType = builder.named("record");

        // Set the projection on a per-call copy so that the shared instance
        // configuration is not polluted with this call's read schema.
        Configuration readConf = new Configuration(conf);
        readConf.set(ReadSupport.PARQUET_READ_SCHEMA, messageType.toString());

        // Read the Parquet file.
        GroupReadSupport readSupport = new GroupReadSupport();
        ParquetReader.Builder<Group> readerBuilder = ParquetReader.builder(readSupport, parquetPath);

        // try-with-resources: the reader must be closed even when read() throws.
        try (ParquetReader<Group> reader = readerBuilder.withConf(readConf).build()) {
            Group line;
            while ((line = reader.read()) != null) {
                for (String field : queryFields) {
                    // The value is read and discarded; the demo only exercises the read path.
                    line.getInteger(field, 0);
                }
            }
        }
    }
}
