<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Forem: Jackson Freitas</title>
    <description>The latest articles on Forem by Jackson Freitas (@jackson_freitas_42cbeaf74).</description>
    <link>https://forem.com/jackson_freitas_42cbeaf74</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F3557662%2Ff7f3c63e-7a6b-4501-87b4-f80e0c069d43.jpg</url>
      <title>Forem: Jackson Freitas</title>
      <link>https://forem.com/jackson_freitas_42cbeaf74</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://forem.com/feed/jackson_freitas_42cbeaf74"/>
    <language>en</language>
    <item>
      <title>Exploring Brazilian E-commerce with Spark on Databricks Free Edition</title>
      <dc:creator>Jackson Freitas</dc:creator>
      <pubDate>Wed, 15 Oct 2025 01:07:08 +0000</pubDate>
      <link>https://forem.com/jackson_freitas_42cbeaf74/exploring-brazilian-e-commerce-with-spark-on-databricks-free-edition-4lb6</link>
      <guid>https://forem.com/jackson_freitas_42cbeaf74/exploring-brazilian-e-commerce-with-spark-on-databricks-free-edition-4lb6</guid>
      <description>&lt;p&gt;The goal of this project is to explore the &lt;strong&gt;Olist dataset&lt;/strong&gt; about Brazilian e-commerce using &lt;strong&gt;Apache Spark&lt;/strong&gt;.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Source:&lt;/strong&gt; &lt;a href="https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce" rel="noopener noreferrer"&gt;Brazilian E-commerce (Kaggle)&lt;/a&gt;&lt;br&gt;
&lt;strong&gt;Engine:&lt;/strong&gt; Spark&lt;br&gt;
&lt;strong&gt;Environment:&lt;/strong&gt; Databricks Notebook&lt;/p&gt;


&lt;h2&gt;
  
  
  1️⃣ Downloading the Dataset via API into Databricks Volumes
&lt;/h2&gt;

&lt;blockquote&gt;
&lt;p&gt;💡 &lt;em&gt;Note:&lt;/em&gt; For Kaggle authentication, I generated the API token and used the &lt;strong&gt;Databricks CLI&lt;/strong&gt; to set a secret with the token.&lt;br&gt;
&lt;/p&gt;


&lt;/blockquote&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;kaggle.api.kaggle_api_extended&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;KaggleApi&lt;/span&gt;

&lt;span class="n"&gt;kaggle_token&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;dbutils&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;secrets&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle-token&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# expand home directory for the current user
&lt;/span&gt;&lt;span class="n"&gt;kaggle_dir&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;expanduser&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;~/.config/kaggle&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;makedirs&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_dir&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;exist_ok&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;kaggle_json_path&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_dir&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle.json&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_json_path&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="k"&gt;with&lt;/span&gt; &lt;span class="nf"&gt;open&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_json_path&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;w&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;write&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_token&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;dataset_identifier&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;olistbr/brazilian-ecommerce&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
&lt;span class="n"&gt;volume_path&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;/Volumes/dev/raw/brazilian-ecommerce&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;

&lt;span class="n"&gt;api&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;KaggleApi&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="n"&gt;api&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;authenticate&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="n"&gt;api&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;dataset_download_files&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;dataset_identifier&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;volume_path&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;unzip&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;📂 Each CSV file represents a different table:&lt;br&gt;
&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Ftkrw431qchcxev2gab0q.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Ftkrw431qchcxev2gab0q.png" alt="Dataset Tables" width="800" height="297"&gt;&lt;/a&gt;&lt;/p&gt;


&lt;h2&gt;
  
  
  2️⃣ Data Analysis
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;First, I set the base reader:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;base_path&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/Volumes/dev/raw/brazilian-ecommerce&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What is the number of orders and the average payment value per status?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nb"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders_payments&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_status&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
            &lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;*&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;total_orders&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
            &lt;span class="nf"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;payment_value&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;avg_payment&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;avg_payment&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+------------+------------+------------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_status&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;total_orders&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="n"&gt;avg_payment&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+------------+------------+------------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="n"&gt;processing&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;319&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mf"&gt;217.53639498432602&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="n"&gt;canceled&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;664&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mf"&gt;215.74638554216864&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="n"&gt;invoiced&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;325&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mf"&gt;212.73227692307694&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="n"&gt;unavailable&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;649&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mf"&gt;194.88368258859782&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="n"&gt;delivered&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mi"&gt;100756&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mf"&gt;153.06742794473817&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="n"&gt;shipped&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;1166&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="mf"&gt;151.9845283018868&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="n"&gt;created&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;           &lt;span class="mi"&gt;5&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="mf"&gt;137.62&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="n"&gt;approved&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;           &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="mf"&gt;120.54&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+------------+------------+------------------+&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What is the number of orders per state?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nb"&gt;sum&lt;/span&gt;

&lt;span class="n"&gt;customers&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;
         &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;options&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;header&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;inferSchema&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
         &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;base_path&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;/olist_customers_dataset.csv&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;customers_payments&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;orders_payments&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;customers&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;customer_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;how&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;inner&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;orders_by_state&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;customers_payments&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;customer_state&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;*&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_count&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
        &lt;span class="nf"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;payment_value&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DECIMAL(10,2)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;total_payments&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_count&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;orders_by_state&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+--------------+-----------+--------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;customer_state&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_count&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;total_payments&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------+-----------+--------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;SP&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mi"&gt;43622&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="mf"&gt;5998226.96&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;RJ&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mi"&gt;13527&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="mf"&gt;2144379.69&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;MG&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mi"&gt;12102&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="mf"&gt;1872257.26&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;RS&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;5668&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;890898.54&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;PR&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;5262&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;811156.38&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;SC&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;3754&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;623086.43&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;BA&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;3610&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;616645.82&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;DF&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;2204&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;355141.08&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;GO&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;2112&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;350092.31&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;ES&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;2107&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;325967.55&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;PE&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1728&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;324850.44&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;CE&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1398&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;279464.03&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;PA&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1011&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;218295.85&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;MT&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;958&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;187029.29&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;MA&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;767&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;152523.02&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;MS&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;736&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;137534.84&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;PB&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;570&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;141545.72&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;PI&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;524&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;108523.97&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;RN&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;522&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;102718.13&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;AL&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;427&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mf"&gt;96962.06&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------+-----------+--------------+&lt;/span&gt;
&lt;span class="n"&gt;only&lt;/span&gt; &lt;span class="n"&gt;showing&lt;/span&gt; &lt;span class="n"&gt;top&lt;/span&gt; &lt;span class="mi"&gt;20&lt;/span&gt; &lt;span class="n"&gt;rows&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What is the average ticket per customer?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;customers_payments&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;customer_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;payment_value&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DECIMAL(8,2)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;avg_ticket&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;avg_ticket&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="n"&gt;customer_id&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;avg_ticket&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;197&lt;/span&gt;&lt;span class="n"&gt;a2a6a77da93f67&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;0.000000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;3532&lt;/span&gt;&lt;span class="n"&gt;ba38a3fd24225&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;0.000000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;a73c1f73f5772cf80&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;0.000000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;fd123d346a17cdf5e&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;1.737500&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;b246eeed30b362c09&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;1.856818&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;92&lt;/span&gt;&lt;span class="n"&gt;cd3ec6e2d643d4e&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;2.410769&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;fc01c21e3a2b27c4d&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;3.166667&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;b6f7351952c806b73&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;3.776667&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;04&lt;/span&gt;&lt;span class="n"&gt;bac030c03668923&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;3.870000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;30&lt;/span&gt;&lt;span class="n"&gt;a9fd4c676d1e0be&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.242500&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;eed931d3a5222a9a5&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.354211&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;a2afcfb0d0d309657&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.375000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;7&lt;/span&gt;&lt;span class="n"&gt;a0a62073458a64b4&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.464000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;d3b38af3b96edca0f&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.698333&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;a790343ca6f3fee08&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.795000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;e73a40f7509a5e81a&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.857143&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;c5ea6b40204131fb4&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.903333&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;e8b585de845954e2d&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;4.940000&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;c98632bdc4c3bd206&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;5.298333&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;7887&lt;/span&gt;&lt;span class="n"&gt;f43daaa91055f&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="mf"&gt;5.333333&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="n"&gt;only&lt;/span&gt; &lt;span class="n"&gt;showing&lt;/span&gt; &lt;span class="n"&gt;top&lt;/span&gt; &lt;span class="mi"&gt;20&lt;/span&gt; &lt;span class="n"&gt;rows&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What is the average number of items per order?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;order_items&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;options&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;header&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;base_path&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;/olist_order_items_dataset.csv&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;orders_order_items&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;orders&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;order_items&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;items_per_order&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders_order_items&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_item_id&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;item_count&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;avg_items_per_order&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;items_per_order&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="nf"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;item_count&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;avg_items_per_order&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;avg_items_per_order&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;items_per_order&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Outputs
&lt;/span&gt;&lt;span class="o"&gt;+-------------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;avg_items_per_order&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+-------------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="mf"&gt;1.1417306873695092&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+-------------------+&lt;/span&gt;

&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;order_id&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;item_count&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;ccbabeb0b02433bd0&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;c6bf92017bd40729c&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;ab87dc5a5f1856a10&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;06&lt;/span&gt;&lt;span class="n"&gt;ff862a85c2402aa&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;f23155f5fa9b82663&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;69&lt;/span&gt;&lt;span class="n"&gt;fd81b0cd556f5da&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;d40dd8018a5302969&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;42560&lt;/span&gt;&lt;span class="n"&gt;dfc8d7863a19&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;3&lt;/span&gt;&lt;span class="n"&gt;f003568147c78508&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;db192ddb0ea5a4d7a&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;5691&lt;/span&gt;&lt;span class="n"&gt;d72069359cd29&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;8&lt;/span&gt;&lt;span class="n"&gt;cae6053f4694ebc0&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;c1784064d438058cf&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;56&lt;/span&gt;&lt;span class="n"&gt;cbdf3f7e3f53568&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;dfda6b8e30bc9ac25&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;3&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;322&lt;/span&gt;&lt;span class="n"&gt;d561e43a3a0c58&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;d6a6d6c3e46448b61&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;387&lt;/span&gt;&lt;span class="n"&gt;ca56ee49ac7729&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;c4de71df747f541f5&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="mi"&gt;1004139&lt;/span&gt;&lt;span class="n"&gt;d05100b9f2&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+----------+&lt;/span&gt;
&lt;span class="n"&gt;only&lt;/span&gt; &lt;span class="n"&gt;showing&lt;/span&gt; &lt;span class="n"&gt;top&lt;/span&gt; &lt;span class="mi"&gt;20&lt;/span&gt; &lt;span class="n"&gt;rows&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What are the top 10 cities with the most orders?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nb"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;

&lt;span class="n"&gt;orders&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;header&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;base_path&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;/olist_orders_dataset.csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;customers&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;header&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;base_path&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;/olist_customers_dataset.csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;payments&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;header&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;base_path&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;/olist_order_payments_dataset.csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;orders_customers_payments&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;payments&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;customers&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;customer_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders_customers_payments&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;customer_city&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_count&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_count&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;limit&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+--------------------+-----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="n"&gt;customer_city&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_count&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+-----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;           &lt;span class="n"&gt;sao&lt;/span&gt; &lt;span class="n"&gt;paulo&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="mi"&gt;16221&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;rio&lt;/span&gt; &lt;span class="n"&gt;de&lt;/span&gt; &lt;span class="n"&gt;janeiro&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;7207&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;belo&lt;/span&gt; &lt;span class="n"&gt;horizonte&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;2872&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;brasilia&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;2193&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;curitiba&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1576&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;campinas&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1515&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="n"&gt;porto&lt;/span&gt; &lt;span class="n"&gt;alegre&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1418&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;            &lt;span class="n"&gt;salvador&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1347&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;           &lt;span class="n"&gt;guarulhos&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;1250&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;sao&lt;/span&gt; &lt;span class="n"&gt;bernardo&lt;/span&gt; &lt;span class="n"&gt;do&lt;/span&gt; &lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;...&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;979&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------------+-----------+&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  What are the top 10 cities with the highest total order value?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders_customers_payments&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;customer_city&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;payment_value&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DECIMAL(10,2)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;total_payments&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;total_payments&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;limit&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+--------------+--------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="n"&gt;customer_city&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;total_payments&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------+--------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="n"&gt;sao&lt;/span&gt; &lt;span class="n"&gt;paulo&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="mf"&gt;2203373.09&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;rio&lt;/span&gt; &lt;span class="n"&gt;de&lt;/span&gt; &lt;span class="n"&gt;janeiro&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;    &lt;span class="mf"&gt;1161927.36&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;belo&lt;/span&gt; &lt;span class="n"&gt;horizonte&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;421765.12&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;brasilia&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;354216.78&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;curitiba&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;247392.48&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;  &lt;span class="n"&gt;porto&lt;/span&gt; &lt;span class="n"&gt;alegre&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;224731.42&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;salvador&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;218071.50&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;      &lt;span class="n"&gt;campinas&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;216248.43&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="n"&gt;guarulhos&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;165121.99&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="n"&gt;niteroi&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;     &lt;span class="mf"&gt;139996.99&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+--------------+--------------+&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  What is the evolution of orders per month?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;to_date&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;date_format&lt;/span&gt;

&lt;span class="n"&gt;orders&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;orders&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;withColumn&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_date&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nf"&gt;date_format&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_purchase_timestamp&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;yyyy-MM&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)))&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;orders&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_date&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_count&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_date&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+----------+-----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_date&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_count&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+----------+-----------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;          &lt;span class="mi"&gt;4&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;09&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;         &lt;span class="mi"&gt;16&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;08&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6512&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;07&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6292&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;06&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6167&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;05&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6873&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;04&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6939&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;03&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;7211&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;02&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;6728&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;01&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mi"&gt;7269&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+----------+-----------+&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;order_payments&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;orders&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;payments&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;
    &lt;span class="n"&gt;order_payments&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;order_date&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
            &lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;order_id&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;total_orders&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt;
            &lt;span class="nf"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;payment_value&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DECIMAL(10,2)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;total_order_value&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;orderBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;total_order_value&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;ascending&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Output
&lt;/span&gt;&lt;span class="o"&gt;+----------+------------+-----------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;order_date&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;total_orders&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;&lt;span class="n"&gt;total_order_value&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+----------+------------+-----------------+&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2017&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;11&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;7863&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1194882.80&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;04&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;7209&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1160785.48&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;03&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;7512&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1159652.12&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;05&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;7135&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1153982.15&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;01&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;7563&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1115004.18&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;07&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;6507&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1066540.75&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;06&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;6419&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1023880.50&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;08&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;6698&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;       &lt;span class="mf"&gt;1022425.32&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2018&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;02&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;6952&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mf"&gt;992463.34&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;|&lt;/span&gt;   &lt;span class="mi"&gt;2017&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;12&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mi"&gt;5895&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;        &lt;span class="mf"&gt;878401.48&lt;/span&gt;&lt;span class="o"&gt;|&lt;/span&gt;
&lt;span class="o"&gt;+----------+------------+-----------------+&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
      <category>spark</category>
      <category>databricks</category>
      <category>analytics</category>
      <category>kaggle</category>
    </item>
    <item>
      <title>Exploring the Netflix TV Shows and Movies Dataset with Spark</title>
      <dc:creator>Jackson Freitas</dc:creator>
      <pubDate>Mon, 13 Oct 2025 12:49:14 +0000</pubDate>
      <link>https://forem.com/jackson_freitas_42cbeaf74/exploring-the-netflix-tv-shows-and-movies-dataset-with-spark-5fdk</link>
      <guid>https://forem.com/jackson_freitas_42cbeaf74/exploring-the-netflix-tv-shows-and-movies-dataset-with-spark-5fdk</guid>
      <description>&lt;h1&gt;
  
  
  🎬 Exploring the Netflix TV Shows and Movies Dataset with Spark
&lt;/h1&gt;

&lt;p&gt;The &lt;strong&gt;"Netflix TV Shows and Movies"&lt;/strong&gt; dataset contains information about movies and TV shows — including title, country, rating, release year, duration, and more.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Goal:&lt;/strong&gt; Use &lt;strong&gt;Apache Spark&lt;/strong&gt; to read and analyze this dataset, generating some useful insights and metrics.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Source:&lt;/strong&gt; &lt;a href="https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies" rel="noopener noreferrer"&gt;Dataset on Kaggle&lt;/a&gt;&lt;br&gt;
&lt;strong&gt;Environment:&lt;/strong&gt; Databricks Free Edition Notebook&lt;/p&gt;




&lt;h2&gt;
  
  
  ⚙️ 1. Install Kaggle and Restart Python
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="o"&gt;%&lt;/span&gt;&lt;span class="n"&gt;pip&lt;/span&gt; &lt;span class="n"&gt;install&lt;/span&gt; &lt;span class="n"&gt;kaggle&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="o"&gt;%&lt;/span&gt;&lt;span class="n"&gt;restart_python&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  📥 2. Download the Dataset into Volumes
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;

&lt;span class="n"&gt;kaggle_token&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;dbutils&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;secrets&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle-token&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;kaggle_creds&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;json&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;loads&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_token&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;kaggle_dir&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;expanduser&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;~/.config/kaggle&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;makedirs&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_dir&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;exist_ok&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;kaggle_json_path&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;os&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;join&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_dir&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kaggle.json&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Write the token to kaggle.json
&lt;/span&gt;&lt;span class="k"&gt;with&lt;/span&gt; &lt;span class="nf"&gt;open&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_json_path&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;w&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="n"&gt;f&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;write&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;kaggle_token&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;kaggle.api.kaggle_api_extended&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;KaggleApi&lt;/span&gt;
&lt;span class="n"&gt;api&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;KaggleApi&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="n"&gt;api&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;authenticate&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="n"&gt;dataset_identifier&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;shivamb/netflix-shows&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
&lt;span class="n"&gt;volume_path&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/Volumes/dev/raw/netflix/&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;

&lt;span class="n"&gt;api&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;dataset_download_files&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;dataset_identifier&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;path&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="n"&gt;volume_path&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;unzip&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;





&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="c1"&gt;-- Preparing catalog&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;CATALOG&lt;/span&gt; &lt;span class="n"&gt;IF&lt;/span&gt; &lt;span class="k"&gt;NOT&lt;/span&gt; &lt;span class="k"&gt;EXISTS&lt;/span&gt; &lt;span class="n"&gt;dev&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="n"&gt;USE&lt;/span&gt; &lt;span class="k"&gt;CATALOG&lt;/span&gt; &lt;span class="n"&gt;dev&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;SCHEMA&lt;/span&gt; &lt;span class="n"&gt;IF&lt;/span&gt; &lt;span class="k"&gt;NOT&lt;/span&gt; &lt;span class="k"&gt;EXISTS&lt;/span&gt; &lt;span class="n"&gt;dev&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;raw&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="n"&gt;VOLUME&lt;/span&gt; &lt;span class="n"&gt;IF&lt;/span&gt; &lt;span class="k"&gt;NOT&lt;/span&gt; &lt;span class="k"&gt;EXISTS&lt;/span&gt; &lt;span class="n"&gt;dev&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;raw&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;netflix&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  📂 3. Read the CSV File and Infer the Schema
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.types&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;StructType&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;IntegerType&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;to_date&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;trim&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;

&lt;span class="n"&gt;schema&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;StructType&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;show_id&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;title&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;director&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;cast&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;country&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;date_added&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;release_year&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;IntegerType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;rating&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;duration&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;listed_in&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;add&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;description&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nc"&gt;StringType&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;

&lt;span class="n"&gt;df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;spark&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;read&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;format&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;options&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;header&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;quote&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'"'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;escape&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'"'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;multiline&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;mode&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PERMISSIVE&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;schema&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;schema&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;ignoreLeadingWhiteSpace&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;ignoreTrailingWhiteSpace&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; \
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;load&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;/Volumes/dev/raw/netflix/netflix_titles.csv&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;withColumn&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;date_added&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="nf"&gt;to_date&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;trim&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;date_added&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)),&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;MMMM d, yyyy&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;blockquote&gt;
&lt;p&gt;💡 &lt;strong&gt;Note:&lt;/strong&gt;&lt;br&gt;
Some options were added to prevent Spark from parsing the data incorrectly.&lt;br&gt;
In this case, quotes and extra spaces were affecting how the data was distributed across columns.&lt;br&gt;
The &lt;code&gt;date_added&lt;/code&gt; column was also converted to a proper date type.&lt;/p&gt;
&lt;/blockquote&gt;




&lt;h2&gt;
  
  
  📊 4. View Statistics and Data Quality
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;when&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;count&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;countDistinct&lt;/span&gt;

&lt;span class="c1"&gt;# Count null values per column
&lt;/span&gt;&lt;span class="n"&gt;df_nulls&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;select&lt;/span&gt;&lt;span class="p"&gt;([&lt;/span&gt;
    &lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;when&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;isNull&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;""&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;|&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt; &lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt; &lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;c&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;columns&lt;/span&gt;
&lt;span class="p"&gt;])&lt;/span&gt;

&lt;span class="c1"&gt;# Count distinct values
&lt;/span&gt;&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;select&lt;/span&gt;&lt;span class="p"&gt;([&lt;/span&gt;&lt;span class="nf"&gt;countDistinct&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)).&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;c&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;c&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;columns&lt;/span&gt;&lt;span class="p"&gt;]))&lt;/span&gt;

&lt;span class="c1"&gt;# Count duplicated records
&lt;/span&gt;&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Duplicate records: &lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;dropDuplicates&lt;/span&gt;&lt;span class="p"&gt;().&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# General statistics
&lt;/span&gt;&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;describe&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;

&lt;span class="c1"&gt;# Show null counts
&lt;/span&gt;&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df_nulls&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# View inferred schema
&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;printSchema&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;

&lt;span class="c1"&gt;# Show a random sample of the dataset
&lt;/span&gt;&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;sample&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;withReplacement&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;False&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;fraction&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mf"&gt;0.01&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;seed&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="mi"&gt;42&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;limit&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;))&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h2&gt;
  
  
  🔍 5. Analysis
&lt;/h2&gt;

&lt;h3&gt;
  
  
  a) How many movies and TV shows are in the catalog?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  b) How are the contents distributed by rating?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;groupBy&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;rating&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;count&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;span class="nf"&gt;display&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;






&lt;h3&gt;
  
  
  c) What is the average duration of movies?
&lt;/h3&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;pyspark.sql.functions&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;col&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;regexp_extract&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;avg&lt;/span&gt;

&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;filter&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;type&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;==&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Movie&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;select&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;regexp_extract&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nf"&gt;col&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;duration&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt; &lt;span class="sa"&gt;r&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;(\d+)&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;int&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;duration&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;agg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;
        &lt;span class="nf"&gt;avg&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;duration&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;cast&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;decimal(10,2)&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
        &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;alias&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;average_duration&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;)&lt;/span&gt;
    &lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



</description>
      <category>analytics</category>
      <category>netflix</category>
      <category>spark</category>
      <category>databricks</category>
    </item>
  </channel>
</rss>
