0x01 Background
24/01/14 04:31:12 ERROR SparkUncaughtExceptionHandler: [Container in shutdown] Uncaught exception in thread Thread[stdout writer for /share/dataproc/envs/py3.10/bin/python,5,main]
java.lang.OutOfMemoryError: Requested array size exceeds VM limit
    at java.lang.StringCoding.encode(StringCoding.java:350)
    at java.lang.String.getBytes(String.java:941)
    at org.apache.spark.unsafe.types.UTF8String.fromString(UTF8String.java:139)
    at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$$nestedInanonfun$makeFromJava$11$1.applyOrElse(EvaluatePython.scala:149)
    at org.apache.spark.sql.execution.python.EvaluatePython$.nullSafeConvert(EvaluatePython.scala:213)
    at org.apache.spark.sql.execution.python.EvaluatePython$.$anonfun$makeFromJava$11(EvaluatePython.scala:148)
    at org.apache.spark.sql.execution.python.EvaluatePython$$$Lambda$851/617702869.apply(Unknown Source)
    at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$$nestedInanonfun$makeFromJava$16$1.applyOrElse(EvaluatePython.scala:195)
    at org.apache.spark.sql.execution.python.EvaluatePython$.nullSafeConvert(EvaluatePython.scala:213)
    at org.apache.spark.sql.execution.python.EvaluatePython$.$anonfun$makeFromJava$16(EvaluatePython.scala:182)
    at org.apache.spark.sql.execution.python.EvaluatePython$$$Lambda$929/208188683.apply(Unknown Source)
    at org.apache.spark.sql.SparkSession.$anonfun$applySchemaToPythonRDD$2(SparkSession.scala:802)
    at org.apache.spark.sql.SparkSession$$Lambda$930/21820570.apply(Unknown Source)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
    at org.apache.spark.ContextAwareIterator.hasNext(ContextAwareIterator.scala:39)
    at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:89)
    at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:80)
    at scala.collection.Iterator.foreach(Iterator.scala:943)
    at scala.collection.Iterator.foreach$(Iterator.scala:943)
    at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:80)
    at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:320)
    at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:734)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:440)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread$$Lambda$841/158147681.apply(Unknown Source)
    at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2088)
    at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:274)

The error is raised in UTF8String.fromString, i.e. while Spark converts a Python-produced value into its internal UTF8String representation: a single string field is so large that the byte array for its UTF-8 encoding would exceed the JVM's maximum array size (close to Integer.MAX_VALUE, roughly 2 GB).
0x02 Solution selection

Since the OOM is triggered by a single oversized string value, the most direct fix is to shrink the payload: compress the string on the Python side before it is handed to Spark, and decompress it where it is consumed. The remaining question is which compression scheme to use.
0x03 Compression scheme selection
Brotli is a lossless compression algorithm developed by Google, originally designed for HTTP content encoding. It strikes a good balance between compression ratio and speed, especially on text and HTML content, and exposes a range of quality levels so users can trade compression speed against compression ratio. Brotli is also broadly supported, from web servers to browsers.
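To illustrate the quality trade-off, here is a minimal sketch (not part of this post's benchmark) using the brotli bindings, whose quality parameter runs from 0 (fastest) to 11 (densest):

# Minimal sketch: time brotli at a few quality levels on a repetitive JSON-like payload.
import time
import brotli

payload = ('{"key": "value", "items": [1, 2, 3]}' * 10000).encode('utf-8')

for quality in (1, 6, 11):
    start = time.time()
    compressed = brotli.compress(payload, quality=quality)
    print(f"quality={quality}: {len(compressed)} bytes in {time.time() - start:.3f}s")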
ZStandard (zstd) is a lossless compression algorithm developed by Facebook, designed to combine a high compression ratio with high speed. Zstd offers a wide range of compression levels, again letting users trade compression speed against compression ratio. Compared with Brotli, zstd usually compresses and decompresses faster, especially on large data sets.
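The equivalent sketch for zstd, using the Python zstandard bindings (levels run up to 22; recent versions of the bindings also accept negative "fast" levels):

# Minimal sketch: time zstd at a few levels on the same kind of payload.
import time
import zstandard

payload = ('{"key": "value", "items": [1, 2, 3]}' * 10000).encode('utf-8')

for level in (-5, 3, 22):
    cctx = zstandard.ZstdCompressor(level=level)
    start = time.time()
    compressed = cctx.compress(payload)
    print(f"level={level}: {len(compressed)} bytes in {time.time() - start:.3f}s")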
0x04 Code writing
import gzip
import time
import brotli
import zstandard
import lzma
import json
import os
import base64

# Generic compression function: compress, base64-encode, write to file,
# and return (elapsed time, compression ratio).
def compress_with_method(input_str, output_file, compression_method, **kwargs):
    start = time.time()
    compressed = compression_method(input_str.encode(), **kwargs)
    with open(output_file, 'wb') as f_out:
        f_out.write(base64.b64encode(compressed))
    end = time.time()
    compressed_size = os.path.getsize(output_file)
    original_size = len(input_str.encode())
    compression_ratio = compressed_size / original_size
    return end - start, compression_ratio

def compress_with_gzip(input_str, output_file, compresslevel):
    return compress_with_method(input_str, output_file, gzip.compress, compresslevel=compresslevel)

def compress_with_brotli(input_str, output_file, quality):
    return compress_with_method(input_str, output_file, brotli.compress, quality=quality)

def compress_with_zstandard(input_str, output_file, level):
    cctx = zstandard.ZstdCompressor(level=level)
    return compress_with_method(input_str, output_file, cctx.compress)

def compress_with_lzma(input_str, output_file, preset):
    return compress_with_method(input_str, output_file, lzma.compress, preset=preset)

# Test function: run one algorithm across its parameter range and record the results.
def test_compression(compression_name, compression_func, param_range, file, json_str, results):
    for param in param_range:
        output_file = f'test_{compression_name}_{param}_{file}.compressed'
        elapsed_time, compression_ratio = compression_func(json_str, output_file, param)
        print(f"{compression_name} level {param} for {file}:")
        print(f"\tCompressed size: {os.path.getsize(output_file) / 1024 / 1024} MB")
        print(f"\tCompression time: {elapsed_time} seconds")
        print(f"\tCompression ratio: {compression_ratio}")
        results.append([file, f'{compression_name}_{param}', elapsed_time, os.path.getsize(output_file), compression_ratio])

# Define the file list
files = ["simple1.json", "simple2.json"]
results = []

# Compress each file
for file in files:
    # Read the test data and wrap it as a JSON string
    with open(file, 'r', encoding='utf-8') as f_in:
        json_str = json.dumps(f_in.read())
    # Calculate the raw data size
    original_size = len(json_str.encode('utf-8'))
    # Test brotli quality levels (0-11)
    test_compression('brotli', compress_with_brotli, range(0, 12), file, json_str, results)
    # Test zstandard levels (-5 to 22)
    test_compression('zstandard', compress_with_zstandard, range(-5, 23), file, json_str, results)
    # Test gzip levels (0-9)
    test_compression('gzip', compress_with_gzip, range(0, 10), file, json_str, results)
    # Test lzma presets (0-9)
    test_compression('lzma', compress_with_lzma, range(0, 10), file, json_str, results)

# Write all results to compression_results.csv
import csv

with open('compression_results.csv', 'w', encoding='utf-8', newline='') as f_out:
    writer = csv.writer(f_out)
    writer.writerow(['file', 'algorithm', 'time', 'size', 'ratio'])
    writer.writerows(results)
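Two notes on the benchmark above. First, because the output files are base64-encoded, the reported sizes and ratios include the roughly 33% base64 overhead; the overhead is the same for every algorithm, so the comparison between algorithms still holds. Second, it is worth sanity-checking that an output file round-trips. A minimal sketch for the zstandard case (verify_zstandard is an illustrative helper, not from the original code):

# Sketch: base64-decode a benchmark output file, decompress it, and compare with the input.
import base64
import zstandard

def verify_zstandard(path, expected_str):
    with open(path, 'rb') as f_in:
        compressed = base64.b64decode(f_in.read())
    decompressed = zstandard.ZstdDecompressor().decompress(compressed)
    return decompressed.decode('utf-8') == expected_str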
0x05 Statistical analysis
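As a sketch of one way to summarise compression_results.csv (column names as written by the benchmark above; the post's own analysis may differ), the following picks the best ratio per file:

# Sketch: find, for each input file, the algorithm/level with the lowest (best) ratio.
import csv
from collections import defaultdict

best = defaultdict(lambda: ('', float('inf')))
with open('compression_results.csv', newline='', encoding='utf-8') as f_in:
    for row in csv.DictReader(f_in):
        ratio = float(row['ratio'])
        if ratio < best[row['file']][1]:
            best[row['file']] = (row['algorithm'], ratio)

for file, (algorithm, ratio) in best.items():
    print(f"{file}: best ratio {ratio:.3f} with {algorithm}")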