Getting Started with PySpark: Difference between revisions
(Created page with "<pre> from pyspark.sql import SparkSession spark = SparkSession \ .builder \ .appName("Python Spark SQL basic example") \ .config("spark.some.config.option", "som...") |
No edit summary |
||
Line 15: | Line 15: | ||
format="csv", sep=",", inferSchema="true", header="true")</pre> | format="csv", sep=",", inferSchema="true", header="true")</pre> | ||
Find null | Find columns that are more than 90% null | ||
<pre>null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict() | <pre> | ||
to_drop = [k for k, v in null_counts.items() if v | threshold = df.count() * .90 | ||
null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0].asDict() | |||
to_drop = [k for k, v in null_counts.items() if v <= threshold ] | |||
</pre> | |||
Drop Null columns | Drop Null columns |
Revision as of 11:09, 30 October 2019
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build the Spark session, or reuse one that already exists (getOrCreate).
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
Load some data
# Read the CSV file: comma-separated, first row used as the header, and
# column types inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("DEX03s - 2019-10-07.csv")
)
Find columns that are more than 90% null
# Count the rows once; a column qualifies for removal when its null count
# exceeds 90% of that total.
threshold = df.count() * 0.90

# Single aggregation pass over the DataFrame: for each column, when() yields
# a value only on rows where the column is null, and count() tallies those
# rows. The one result row is turned into a {column_name: null_count} dict.
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]
).collect()[0].asDict()

# Keep the names of columns that are MORE than 90% null. The comparison was
# previously `<=`, which selected the well-populated columns instead and
# would have dropped the useful data.
to_drop = [k for k, v in null_counts.items() if v > threshold]
Drop the columns that are more than 90% null
# Remove every column listed in to_drop; drop() takes the names as separate
# positional arguments, hence the unpacking.
clean = df.drop(*to_drop)

# NOTE(review): display() is not defined on this page — presumably the
# notebook environment's built-in renderer; confirm before running elsewhere.
display(clean)