<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Forem: allan-pg</title>
    <description>The latest articles on Forem by allan-pg (@allan-pg).</description>
    <link>https://forem.com/allan-pg</link>
    <image>
      <url>https://media2.dev.to/dynamic/image/width=90,height=90,fit=cover,gravity=auto,format=auto/https:%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F895054%2F222ce658-bd10-4166-83f0-d8edd89f535d.png</url>
      <title>Forem: allan-pg</title>
      <link>https://forem.com/allan-pg</link>
    </image>
    <atom:link rel="self" type="application/rss+xml" href="https://forem.com/feed/allan-pg"/>
    <language>en</language>
    <item>
      <title>[Boost]</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Sun, 08 Mar 2026 22:27:16 +0000</pubDate>
      <link>https://forem.com/allan-pg/-530e</link>
      <guid>https://forem.com/allan-pg/-530e</guid>
      <description>&lt;div class="ltag__link--embedded"&gt;
  &lt;div class="crayons-story "&gt;
  &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o" class="crayons-story__hidden-navigation-link"&gt;Designing a Modern Data Warehouse: Combining Bill Inmon and Ralph Kimball in a Hybrid Medallion Architecture&lt;/a&gt;


  &lt;div class="crayons-story__body crayons-story__body-full_post"&gt;
    &lt;div class="crayons-story__top"&gt;
      &lt;div class="crayons-story__meta"&gt;
        &lt;div class="crayons-story__author-pic"&gt;

          &lt;a href="/allan-pg" class="crayons-avatar  crayons-avatar--l  "&gt;
            &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F895054%2F222ce658-bd10-4166-83f0-d8edd89f535d.png" alt="allan-pg profile" class="crayons-avatar__image"&gt;
          &lt;/a&gt;
        &lt;/div&gt;
        &lt;div&gt;
          &lt;div&gt;
            &lt;a href="/allan-pg" class="crayons-story__secondary fw-medium m:hidden"&gt;
              allan-pg
            &lt;/a&gt;
            &lt;div class="profile-preview-card relative mb-4 s:mb-0 fw-medium hidden m:inline-block"&gt;
              
                allan-pg
                
              
              &lt;div id="story-author-preview-content-3308454" class="profile-preview-card__content crayons-dropdown branded-7 p-4 pt-0"&gt;
                &lt;div class="gap-4 grid"&gt;
                  &lt;div class="-mt-4"&gt;
                    &lt;a href="/allan-pg" class="flex"&gt;
                      &lt;span class="crayons-avatar crayons-avatar--xl mr-2 shrink-0"&gt;
                        &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F895054%2F222ce658-bd10-4166-83f0-d8edd89f535d.png" class="crayons-avatar__image" alt=""&gt;
                      &lt;/span&gt;
                      &lt;span class="crayons-link crayons-subtitle-2 mt-5"&gt;allan-pg&lt;/span&gt;
                    &lt;/a&gt;
                  &lt;/div&gt;
                  &lt;div class="print-hidden"&gt;
                    
                      Follow
                    
                  &lt;/div&gt;
                  &lt;div class="author-preview-metadata-container"&gt;&lt;/div&gt;
                &lt;/div&gt;
              &lt;/div&gt;
            &lt;/div&gt;

          &lt;/div&gt;
          &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o" class="crayons-story__tertiary fs-xs"&gt;&lt;time&gt;Mar 4&lt;/time&gt;&lt;span class="time-ago-indicator-initial-placeholder"&gt;&lt;/span&gt;&lt;/a&gt;
        &lt;/div&gt;
      &lt;/div&gt;

    &lt;/div&gt;

    &lt;div class="crayons-story__indention"&gt;
      &lt;h2 class="crayons-story__title crayons-story__title-full_post"&gt;
        &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o" id="article-link-3308454"&gt;
          Designing a Modern Data Warehouse: Combining Bill Inmon and Ralph Kimball in a Hybrid Medallion Architecture
        &lt;/a&gt;
      &lt;/h2&gt;
        &lt;div class="crayons-story__tags"&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/database"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;database&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/datawarehouse"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;datawarehouse&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/dataarchitecture"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;dataarchitecture&lt;/a&gt;
            &lt;a class="crayons-tag  crayons-tag--monochrome " href="/t/dataengineering"&gt;&lt;span class="crayons-tag__prefix"&gt;#&lt;/span&gt;dataengineering&lt;/a&gt;
        &lt;/div&gt;
      &lt;div class="crayons-story__bottom"&gt;
        &lt;div class="crayons-story__details"&gt;
          &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left"&gt;
            &lt;div class="multiple_reactions_aggregate"&gt;
              &lt;span class="multiple_reactions_icons_container"&gt;
                  &lt;span class="crayons_icon_container"&gt;
                    &lt;img src="https://assets.dev.to/assets/fire-f60e7a582391810302117f987b22a8ef04a2fe0df7e3258a5f49332df1cec71e.svg" width="18" height="18"&gt;
                  &lt;/span&gt;
                  &lt;span class="crayons_icon_container"&gt;
                    &lt;img src="https://assets.dev.to/assets/sparkle-heart-5f9bee3767e18deb1bb725290cb151c25234768a0e9a2bd39370c382d02920cf.svg" width="18" height="18"&gt;
                  &lt;/span&gt;
              &lt;/span&gt;
              &lt;span class="aggregate_reactions_counter"&gt;2&lt;span class="hidden s:inline"&gt; reactions&lt;/span&gt;&lt;/span&gt;
            &lt;/div&gt;
          &lt;/a&gt;
            &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o#comments" class="crayons-btn crayons-btn--s crayons-btn--ghost crayons-btn--icon-left flex items-center"&gt;
              Comments


              &lt;span class="hidden s:inline"&gt;Add Comment&lt;/span&gt;
            &lt;/a&gt;
        &lt;/div&gt;
        &lt;div class="crayons-story__save"&gt;
          &lt;small class="crayons-story__tertiary fs-xs mr-2"&gt;
            3 min read
          &lt;/small&gt;
            
              &lt;span class="bm-initial"&gt;
                

              &lt;/span&gt;
              &lt;span class="bm-success"&gt;
                

              &lt;/span&gt;
            
        &lt;/div&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/div&gt;
&lt;/div&gt;

&lt;/div&gt;


</description>
      <category>database</category>
      <category>datawarehouse</category>
      <category>dataarchitecture</category>
      <category>dataengineering</category>
    </item>
    <item>
      <title>all you need to know on data Warehouse</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Wed, 04 Mar 2026 23:30:29 +0000</pubDate>
      <link>https://forem.com/allan-pg/all-you-need-to-know-on-data-warehouse-3mdh</link>
      <guid>https://forem.com/allan-pg/all-you-need-to-know-on-data-warehouse-3mdh</guid>
      <description>&lt;div class="ltag__link"&gt;
  &lt;a href="/allan-pg" class="ltag__link__link"&gt;
    &lt;div class="ltag__link__pic"&gt;
      &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F895054%2F222ce658-bd10-4166-83f0-d8edd89f535d.png" alt="allan-pg"&gt;
    &lt;/div&gt;
  &lt;/a&gt;
  &lt;a href="https://dev.to/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o" class="ltag__link__link"&gt;
    &lt;div class="ltag__link__content"&gt;
      &lt;h2&gt;Designing a Modern Data Warehouse: Combining Bill Inmon and Ralph Kimball in a Hybrid Medallion Architecture&lt;/h2&gt;
      &lt;h3&gt;allan-pg ・ Mar 4&lt;/h3&gt;
      &lt;div class="ltag__link__taglist"&gt;
        &lt;span class="ltag__link__tag"&gt;#database&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#datawarehouse&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#dataarchitecture&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#dataengineering&lt;/span&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/a&gt;
&lt;/div&gt;


</description>
      <category>database</category>
      <category>datawarehouse</category>
      <category>dataarchitecture</category>
      <category>dataengineering</category>
    </item>
    <item>
      <title>Designing a Modern Data Warehouse: Combining Bill Inmon and Ralph Kimball in a Hybrid Medallion Architecture</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Wed, 04 Mar 2026 10:53:13 +0000</pubDate>
      <link>https://forem.com/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o</link>
      <guid>https://forem.com/allan-pg/unlocking-data-warehouse-magic-kimball-vs-inmon-vs-the-ultimate-hybrid-hack-1g7o</guid>
      <description>&lt;p&gt;Hey devs and data enthusiasts! Ever felt like your data warehouse is a chaotic junk drawer full of potential but impossible to navigate? You're not alone. In the world of big data, choosing the right architecture can make or break your analytics game. Today, we're diving into two legendary approaches: Ralph Kimball's dimensional magic and Bill Inmon's corporate warehouse. But here's the twist: we'll combine them into a hybrid powerhouse using modern data layers. We're going to create a blueprint for a Hybrid Inmon-Kimball architecture, where Inmon's structured approach is utilized for integration in the silver layer and Kimball's user-friendly approach is utilized for blazing query speeds in the gold layer! Let's roll!&lt;/p&gt;

&lt;h2&gt;
  
  
  Bill Inmon's Corporate Information Factory (The Top-Down Approach)
&lt;/h2&gt;

&lt;p&gt;Imagine your data warehouse as a very big, well-organized central library. This is the idea behind Bill Inmon’s approach, often called the Corporate Information Factory. First, you build an enterprise data warehouse where all data from different systems is cleaned, organized, and stored. An enterprise data warehouse is normalized up to third normal form (3NF). Think of it like sorting all your LEGO pieces carefully before building anything. After building the enterprise data warehouse, you then create smaller sections called data marts for specific teams like finance or marketing. The advantage is that the data is very accurate, consistent, and reliable across the company. The downside is that it takes a long time to design and build at the beginning, and queries can sometimes be slower because the data is split into many related tables.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Febpgtztz9dh27i6h73l6.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Febpgtztz9dh27i6h73l6.png" alt="Bill Inmon's Corporate Information Factory" width="560" height="478"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Inmon’s idea is simple: build a strong, clean foundation first, then everything else becomes easier.&lt;/strong&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Ralph Kimball's Dimensional Modeling (Bottom-Up Approach)
&lt;/h2&gt;

&lt;p&gt;Kimball’s Dimensional Data Warehouse collects data from different operational systems, cleans and integrates it through ETL, and stores it in a central warehouse designed using star schemas. The warehouse keeps detailed (atomic) data but organizes it in a way that is easy for reporting and analysis. In this design, fact tables store measurable business events, usually numerical values like sales amount, quantity sold, or transaction count. However, not every number is a fact. For example, a customer’s phone number or postal code is numeric, but it is not a fact because it does not represent a measurable business event. Facts answer questions like “How much?” “How many?” “How often?”&lt;/p&gt;

&lt;p&gt;Dimension tables contain descriptive information used to filter, group, or label the facts. Examples include customer name, product category, store location, or date. If you ask, “Total sales by product category by month,” the sales amount comes from the fact table, while product category and month come from dimension tables.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F98aiv8f7ujwpfvnnin2z.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F98aiv8f7ujwpfvnnin2z.png" alt="Ralph Kimball's Dimensional Modeling" width="720" height="405"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Kimball’s idea is simple: deliver useful data quickly and make it easy for people to analyze.&lt;/strong&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Hybrid Inmon-Kimball in a Medallion Architecture
&lt;/h2&gt;

&lt;p&gt;A hybrid data warehouse model simply combines the strengths of both Inmon and Kimball in a modern layered setup. First, raw data lands in the Bronze layer, just as it comes from source systems. Then in the Silver layer, you apply Inmon’s approach by cleaning, integrating, and organizing the data into a structured, normalized format to create one trusted version of the truth for the whole company. This layer focuses on accuracy, consistency, and removing duplicates. Finally, in the Gold layer, you apply Kimball’s approach by transforming that clean data into easy-to-use star schemas and data marts designed for fast reporting and dashboards.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F69vzwi7nrkquxup7s8em.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F69vzwi7nrkquxup7s8em.png" alt="Hybrid Inmon-Kimball" width="638" height="562"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Wrapping It Up:
&lt;/h2&gt;

&lt;p&gt;It doesn’t matter whether you’re building a small personal project or a massive enterprise data system: the key is to start small and build in layers. Think of your data in stages (raw data, cleaned data, and ready-to-use data), then pick tools that make it easy to manage, like Snowflake or Databricks. Focus on iterating and improving gradually instead of trying to do everything at once. This way, your data warehouse becomes more flexible, easier to use, and actually helpful for the people who need it.&lt;/p&gt;

</description>
      <category>database</category>
      <category>datawarehouse</category>
      <category>dataarchitecture</category>
      <category>dataengineering</category>
    </item>
    <item>
      <title>From Messy to Masterpiece: Cleaning FIFA Data with Python ⚽🚀</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Wed, 19 Feb 2025 06:57:38 +0000</pubDate>
      <link>https://forem.com/allan-pg/fifa-data-cleaning-1l9a</link>
      <guid>https://forem.com/allan-pg/fifa-data-cleaning-1l9a</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;Data cleaning is a very important part of data analysis. Clean data ensures that your analysis is correct and your insights are reliable when used in decision making. It also improves the quality of your data.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Objectives:&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Ensure that all columns are clearly named&lt;/li&gt;
&lt;li&gt;Ensure that columns have the correct datatypes&lt;/li&gt;
&lt;li&gt;Remove all unnecessary information from the dataset
Let's get started:&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Import necessary python libraries
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;numpy&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;np&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;pandas&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;warnings&lt;/span&gt;
&lt;span class="n"&gt;warnings&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;filterwarnings&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;ignore&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Import the CSV data set from the folder where it's saved
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;read_csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;r&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;...\fifa21 raw data v2.csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Set your data frame display options so that all columns are shown, since the dataset has many columns.
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;set_option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;display.max_rows&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;1000&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;set_option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;display.max_columns&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;1000&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;set_option&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;display.width&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;1000&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;The benefit is that not only are all columns displayed, but printed rows can also be wider than&lt;br&gt;
the usual ~100-character limit.&lt;/p&gt;

&lt;h2&gt;
  
  
  Display the first five rows
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;head&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Check how many rows and columns make up your DataFrame
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;shape&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;The number of rows are:&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;shape&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;0&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;The number of columns are:&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;shape&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Display a list of all column names
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;x&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;columns&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;tolist&lt;/span&gt;&lt;span class="p"&gt;():&lt;/span&gt;
   &lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;x&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Make a copy of your data set so as to retain an original copy
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df1&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;copy&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Remove unnecessary columns
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df1&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;drop&lt;/span&gt;&lt;span class="p"&gt;([&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;photoUrl&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;playerUrl&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt; &lt;span class="n"&gt;axis&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Rename columns
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df1&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;rename&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;columns&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;LongName&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;↓OVA&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Overall Rating(%)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;POT&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Potential(%)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;BOV&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Best Overall(%)&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;BP&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Best Position&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;W/F&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Weak Foot&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;SM&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Skill Moves&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;A/W&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Attacking Work Rate&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;D/W&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Defensive Work Rate&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;IR&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;International Reputation&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PAC&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Pace&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;SHO&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Shooting&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PAS&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Passing&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DRI&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Dribbling&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;DEF&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Defense&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt;
&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;PHY&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;"&lt;/span&gt;&lt;span class="s"&gt;Physicality&lt;/span&gt;&lt;span class="sh"&gt;"&lt;/span&gt;
&lt;span class="p"&gt;})&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Remove leading whitespace from the Club column
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;#Remove whitespaces from the club column
&lt;/span&gt;&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Club&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Club&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nb"&gt;str&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;lstrip&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Clean contract column using a function
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;#define a function to change contract column values
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;contract_status&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
     &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;On Loan&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;On Loan&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;
     &lt;span class="k"&gt;elif&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;~&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Active&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;
     &lt;span class="k"&gt;else&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Free&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;

&lt;span class="c1"&gt;#apply the function on contract column
&lt;/span&gt;&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Contract&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Contract&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;apply&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;contract_status&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;astype&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;category&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Rename Contract column
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;#Rename the contract column
&lt;/span&gt;&lt;span class="n"&gt;df1&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;rename&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;columns&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Contract&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Contract Status&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;})&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Process of cleaning contract column
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;A function was defined to change the row values from '2004 ~ 2021', 'On Loan' and 'Free' to 'Active', 'On Loan' and 'Free'.
The 'Contract' column was renamed to 'Contract Status'.&lt;/li&gt;
&lt;li&gt;Also, the data type was changed to category.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Check Height and Weight Columns
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;#Check for unique values
&lt;/span&gt;&lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;column&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Height&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Weight&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]]:&lt;/span&gt;
     &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;column&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;unique&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
     &lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;f&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;column&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="se"&gt;\n&lt;/span&gt;&lt;span class="si"&gt;{&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="si"&gt;}&lt;/span&gt;&lt;span class="s"&gt;.&lt;/span&gt;&lt;span class="se"&gt;\n&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Create Function to convert height to cm
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;#Function to convert height to cm
&lt;/span&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;convert_height&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
   &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;cm&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;int&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;[:&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;
   &lt;span class="k"&gt;else&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;feet&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;inches&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;split&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;"'"&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="n"&gt;total_inches&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;int&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;feet&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mi"&gt;12&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="nf"&gt;int&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;inches&lt;/span&gt;&lt;span class="p"&gt;[:&lt;/span&gt;&lt;span class="o"&gt;-&lt;/span&gt;&lt;span class="mi"&gt;1&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
       &lt;span class="n"&gt;height_cm&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;total_inches&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;2.54&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="nf"&gt;round&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;height_cm&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Height&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Height&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;apply&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;convert_height&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;astype&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;int64&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Height&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;unique&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Create function to convert weight to kg
&lt;/h2&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="k"&gt;def&lt;/span&gt; &lt;span class="nf"&gt;convert_weight&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;):&lt;/span&gt;
    &lt;span class="k"&gt;if&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kg&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;kg&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;
    &lt;span class="k"&gt;else&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
       &lt;span class="n"&gt;value&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;strip&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;lbs&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="n"&gt;Weight&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nf"&gt;round&lt;/span&gt;&lt;span class="p"&gt;((&lt;/span&gt;&lt;span class="nf"&gt;float&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;value&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="mf"&gt;0.45359237&lt;/span&gt;&lt;span class="p"&gt;),&lt;/span&gt; &lt;span class="mi"&gt;2&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
       &lt;span class="k"&gt;return&lt;/span&gt; &lt;span class="n"&gt;Weight&lt;/span&gt;

&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Weight&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Weight&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;apply&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;convert_weight&lt;/span&gt;&lt;span class="p"&gt;).&lt;/span&gt;&lt;span class="nf"&gt;astype&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;int64&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;df1&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Weight&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;unique&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Process of Cleaning 'Height' and 'Weight' columns
&lt;/h2&gt;

&lt;ol&gt;
&lt;li&gt;Create a function to convert Height&lt;/li&gt;
&lt;li&gt;values with cm remain the same except that we extract only value without 'CM'&lt;/li&gt;
&lt;li&gt;value in feet and inches we convert them to CM&lt;/li&gt;
&lt;li&gt;Lastly, convert Height column to int&lt;/li&gt;
&lt;/ol&gt;

&lt;p&gt;&lt;strong&gt;Process of cleaning Weight column&lt;/strong&gt;&lt;/p&gt;

&lt;ol&gt;
&lt;li&gt;create a function to convert LBS to kgs&lt;/li&gt;
&lt;li&gt;If value is in "KGS" it remains the same&lt;/li&gt;
&lt;li&gt;If value is in "LBS" convert them to KGS by multiplying by 0.4535...&lt;/li&gt;
&lt;/ol&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;In this project, we embarked on a comprehensive data cleaning and transformation journey with the FIFA dataset. Our primary objective was to prepare the data for analysis by addressing inconsistencies, refining data types, and enhancing its usability.&lt;/p&gt;

&lt;p&gt;In conclusion, data cleaning and transformation are foundational steps in any data analysis project. By addressing inconsistencies, refining data types, and enhancing data quality, we have set the stage&lt;br&gt;
for more meaningful and insightful analyses. The clean and structured dataset is now well-equipped for advanced analytics, visualizations, and modeling.&lt;/p&gt;

&lt;p&gt;Follow me on &lt;a href="https://github.com/allan-pg" rel="noopener noreferrer"&gt;GitHub&lt;/a&gt; and &lt;a href="https://www.linkedin.com/in/allan-mwangi-data-analyst/" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt; for more details on the project.&lt;/p&gt;

</description>
      <category>datascience</category>
      <category>data</category>
      <category>dataengineering</category>
      <category>python</category>
    </item>
    <item>
      <title>📌Are you interested in being a data analyst👩‍💻 that stands out from the rest? This can only happen when you master the skills that no one talks about the ones that turn data into decisions and insights📊 into impact.</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Fri, 07 Feb 2025 13:07:50 +0000</pubDate>
      <link>https://forem.com/allan-pg/are-you-interested-in-being-a-data-analyst-that-stands-out-from-the-rest-this-can-only-happen-3pi5</link>
      <guid>https://forem.com/allan-pg/are-you-interested-in-being-a-data-analyst-that-stands-out-from-the-rest-this-can-only-happen-3pi5</guid>
      <description>&lt;div class="ltag__link"&gt;
  &lt;a href="/allan-pg" class="ltag__link__link"&gt;
    &lt;div class="ltag__link__pic"&gt;
      &lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Fuser%2Fprofile_image%2F895054%2F222ce658-bd10-4166-83f0-d8edd89f535d.png" alt="allan-pg"&gt;
    &lt;/div&gt;
  &lt;/a&gt;
  &lt;a href="https://dev.to/allan-pg/beyond-the-code-the-non-technical-superpowers-every-data-analyst-needs-to-succeed-1ich" class="ltag__link__link"&gt;
    &lt;div class="ltag__link__content"&gt;
      &lt;h2&gt;Beyond the Code: The Non-Technical Superpowers Every Data Analyst Needs to Succeed.&lt;/h2&gt;
      &lt;h3&gt;allan-pg ・ Feb 7 '25&lt;/h3&gt;
      &lt;div class="ltag__link__taglist"&gt;
        &lt;span class="ltag__link__tag"&gt;#dataanalysis&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#datascience&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#dataengineering&lt;/span&gt;
        &lt;span class="ltag__link__tag"&gt;#softskills&lt;/span&gt;
      &lt;/div&gt;
    &lt;/div&gt;
  &lt;/a&gt;
&lt;/div&gt;


</description>
      <category>dataanalysis</category>
      <category>datascience</category>
      <category>dataengineering</category>
      <category>data</category>
    </item>
    <item>
      <title>Beyond the Code: The Non-Technical Superpowers Every Data Analyst Needs to Succeed.</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Fri, 07 Feb 2025 13:01:20 +0000</pubDate>
      <link>https://forem.com/allan-pg/beyond-the-code-the-non-technical-superpowers-every-data-analyst-needs-to-succeed-1ich</link>
      <guid>https://forem.com/allan-pg/beyond-the-code-the-non-technical-superpowers-every-data-analyst-needs-to-succeed-1ich</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;When I was starting out in data analysis, I thought it was all about beautiful dashboards and writing complex SQL queries and Python scripts. While technical skills are important, non-technical skills will make you a top-notch data analyst who stands out from the rest. These skills will help you deliver impactful insights from raw data, making your work even more valuable.&lt;/p&gt;

&lt;p&gt;These are just some crucial non-technical skills every data analyst should develop and how essential they are in today's data-driven world.&lt;/p&gt;

&lt;h2&gt;
  
  
  1. Problem-Solving
&lt;/h2&gt;

&lt;p&gt;As a data analyst, you are always solving problems. From understanding how to retain customers to explaining why sales are declining, you find solutions to problems.&lt;/p&gt;

&lt;h3&gt;
  
  
  How to develop skills in problem-solving
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Continuously improving your projects&lt;/li&gt;
&lt;li&gt;Being able to break down complex projects into small and manageable tasks&lt;/li&gt;
&lt;li&gt;Learning different methods of analyzing your projects&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  2. Collaborating and Working together as a team
&lt;/h2&gt;

&lt;p&gt;You will often be working as a team to ensure better results and outcomes. For example, you will need to work with the marketing team to ensure there are better sales. Good collaboration ensures smoother workflows and better outcomes.&lt;/p&gt;

&lt;h3&gt;
  
  
  How to develop skills in team work
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;communicate clearly with other team members&lt;/li&gt;
&lt;li&gt;Work on projects where you work with other data analysts&lt;/li&gt;
&lt;li&gt;Accept feedback and criticism from others for continuous improvement&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  3. Attention to details
&lt;/h2&gt;

&lt;p&gt;A small mistake in your data can lead to incorrect conclusions and poor decision-making. Attention to detail ensures accuracy of insights and reliability of your analysis.&lt;/p&gt;

&lt;h3&gt;
  
  
  How to develop skills in attention to details
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Ensure data is correct by double checking the source of data and formulas used in calculations.&lt;/li&gt;
&lt;li&gt;Use automation tools to double check your data and the original data.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  4. Ability to stay up to date
&lt;/h2&gt;

&lt;p&gt;Data analysis and software development are always evolving. You should always be ready to explore new tools that help you stay ahead of the curve.&lt;/p&gt;

&lt;h3&gt;
  
  
  How to stay up to date
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Reading blogs and articles published online&lt;/li&gt;
&lt;li&gt;Participating in online competitions and workshops&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  5. Ability to communicate effectively
&lt;/h2&gt;

&lt;p&gt;As a data analyst you should be able to communicate insights to stakeholders effectively. Lack of proper communication skills means you can't communicate insights effectively, which makes your analysis lose value.&lt;/p&gt;

&lt;h2&gt;
  
  
  How to communicate effectively
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Practice explaining your findings to non-technical audiences&lt;/li&gt;
&lt;li&gt;Use visualizations to explain findings effectively&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;Non-technical skills like communication, critical thinking, and business acumen are what truly elevate your impact as a data analyst. These skills help you translate complex data into actionable insights, foster better collaboration with teams, and align your work with broader business goals.&lt;/p&gt;

&lt;p&gt;When you are learning, don't just focus on coding; invest time in improving your non-technical skills too.&lt;/p&gt;

&lt;p&gt;&lt;em&gt;What non-technical skills have made the biggest difference in your data analytics journey? Share your thoughts in the comments below!&lt;/em&gt;&lt;/p&gt;

</description>
      <category>dataanalysis</category>
      <category>datascience</category>
      <category>dataengineering</category>
      <category>softskills</category>
    </item>
    <item>
      <title>Mastering Data Wrangling: A Simple Guide for Developers</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Thu, 06 Feb 2025 07:16:05 +0000</pubDate>
      <link>https://forem.com/allan-pg/mastering-data-wrangling-a-simple-guide-for-developers-2e93</link>
      <guid>https://forem.com/allan-pg/mastering-data-wrangling-a-simple-guide-for-developers-2e93</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;Data wrangling is the process of turning raw data into useful data. This process involves cleaning, structuring, and enriching raw data for analysis. &lt;/p&gt;

&lt;h2&gt;
  
  
  What is Data Wrangling?
&lt;/h2&gt;

&lt;p&gt;Data wrangling is the process of transforming and organizing raw data into a structured format. It is also known as data munging. It involves:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Data Cleaning&lt;/strong&gt;: Removing duplicates from your dataset, handling missing values, and correcting errors.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Data Transformation&lt;/strong&gt;: Changing formats, normalizing, and encoding data.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Data Integration&lt;/strong&gt;: Combining data from different sources to a unified view.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Data Enrichment&lt;/strong&gt;: Adding new relevant information to your dataset.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  &lt;strong&gt;Why is Data Wrangling Important?&lt;/strong&gt;
&lt;/h3&gt;

&lt;p&gt;Raw data is often incomplete, inconsistent, and unstructured. Without proper wrangling, analysis can lead to incorrect conclusions. &lt;/p&gt;

&lt;h2&gt;
  
  
  Importance of data wrangling
&lt;/h2&gt;

&lt;p&gt;Well-prepared data ensures:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Better model accuracy for machine learning.&lt;/li&gt;
&lt;li&gt;Improved decision-making in businesses.&lt;/li&gt;
&lt;li&gt;Enhanced data visualization and reporting.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Common Data Wrangling Techniques
&lt;/h3&gt;

&lt;h4&gt;
  
  
  Handling Missing Data
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;pandas&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;

&lt;span class="n"&gt;data&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Alice&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Bob&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;David&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;25&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="bp"&gt;None&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;30&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;40&lt;/span&gt;&lt;span class="p"&gt;]}&lt;/span&gt;
&lt;span class="n"&gt;df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nc"&gt;DataFrame&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;data&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;isnull&lt;/span&gt;&lt;span class="p"&gt;().&lt;/span&gt;&lt;span class="nf"&gt;sum&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;  &lt;span class="c1"&gt;# Check missing values
&lt;/span&gt;
&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;fillna&lt;/span&gt;&lt;span class="p"&gt;({&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Unknown&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;mean&lt;/span&gt;&lt;span class="p"&gt;()},&lt;/span&gt; &lt;span class="n"&gt;inplace&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;  &lt;span class="c1"&gt;# Fill missing values
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Removing Duplicates
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;drop_duplicates&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;inplace&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="bp"&gt;True&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Changing Data Types
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;astype&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="nb"&gt;int&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Normalizing Data
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;min&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt; &lt;span class="o"&gt;/&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;max&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Age&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;min&lt;/span&gt;&lt;span class="p"&gt;())&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h4&gt;
  
  
  Merging DataFrames
&lt;/h4&gt;



&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;data2&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;{&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Alice&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Bob&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;David&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;],&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Salary&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt; &lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="mi"&gt;50000&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;55000&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="mi"&gt;60000&lt;/span&gt;&lt;span class="p"&gt;]}&lt;/span&gt;
&lt;span class="n"&gt;df2&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nc"&gt;DataFrame&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;data2&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;merged_df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;merge&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;df2&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;on&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Name&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;how&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;left&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;merged_df&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  MY GO-TO Tools for Data Wrangling
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Pandas&lt;/strong&gt;: Powerful Python library for handling structured data.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;NumPy&lt;/strong&gt;: Useful for handling numerical operations.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;SQL&lt;/strong&gt;: For structured data manipulation.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Final Thoughts
&lt;/h3&gt;

&lt;p&gt;Data wrangling is an important step in any data project. Clean and structured data ensures accurate insights and better decision-making. &lt;/p&gt;

&lt;p&gt;What’s your go-to method for data wrangling? Let me know in the comments!&lt;/p&gt;

</description>
      <category>beginners</category>
      <category>python</category>
      <category>datascience</category>
      <category>tutorial</category>
    </item>
    <item>
      <title>Beginners guide to Machine Learning</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Sat, 02 Nov 2024 09:33:41 +0000</pubDate>
      <link>https://forem.com/allan-pg/beginners-guide-to-machine-learning-2l89</link>
      <guid>https://forem.com/allan-pg/beginners-guide-to-machine-learning-2l89</guid>
      <description>&lt;p&gt;Have you ever noticed that after you search for a product online, you start getting recommendations for similar products on websites like Amazon? Or that when you type something on your phone, your keyboard suggests the word you are about to type? All this has been made possible by the application of &lt;strong&gt;Machine Learning.&lt;/strong&gt;&lt;/p&gt;

&lt;p&gt;&lt;em&gt;Machine Learning is a way of teaching computers to make predictions without having to program them for every single task. &lt;br&gt;
Machine Learning is a subset of Artificial Intelligence (AI).&lt;/em&gt;&lt;/p&gt;

&lt;p&gt;Your friend started out in car sales and has made thousands of dollars. Because you are a data scientist, he has offered to make you his business partner. He supplies the money and you build models to predict how much the cars should sell for. As a data scientist, you ask your friend which criteria he used to predict the price of a car, and you find that he used past prices to predict the price of a car.&lt;/p&gt;

&lt;p&gt;&lt;strong&gt;Machine Learning&lt;/strong&gt; allows you to predict prices in the same way but in a more accurate manner. We will be using Decision Trees for a clear explanation because they are easier to understand. Your friend was using the below decision tree model to predict future prices.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fv69batjkf61v9n3aarf6.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fv69batjkf61v9n3aarf6.png" alt="Decision Tree Model" width="596" height="410"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;This model only divides cars into two categories. The biggest disadvantage of this model is that it does not capture most of the factors affecting the price of a car, for example the age of the car, make and model, mileage, accident history (if any), and the condition of the car. You can use a deeper tree to include more factors. &lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fxnzre94yyhc7apy4sse5.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fxnzre94yyhc7apy4sse5.png" alt="Decision Tree" width="723" height="413"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The above model now includes more factors: whether the car is manual or automatic, and its engine capacity. These factors are referred to as &lt;strong&gt;features&lt;/strong&gt; in your model, and the price we are predicting is the &lt;strong&gt;prediction target&lt;/strong&gt;.&lt;/p&gt;

&lt;h2&gt;
  
  
  Steps to building and using a Machine learning model
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;
&lt;strong&gt;Data Collection:&lt;/strong&gt; Gather the data you will need to build your model, for example pictures or, in our case, past data on car sales.&lt;/li&gt;
&lt;li&gt;Define: Find out what type of model best suits what you need to achieve. &lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Fit:&lt;/strong&gt; Train the model by showing it your data and letting it learn. &lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Predict:&lt;/strong&gt; The model makes predictions, and you tell it whether it's right or wrong. Over time, it gets better.&lt;/li&gt;
&lt;li&gt;
&lt;strong&gt;Evaluate:&lt;/strong&gt; Determine how accurate the model's predictions are.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Applications of Machine Learning
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Recommendations: Based on your recent search history, for example on YouTube, videos are recommended to you.&lt;/li&gt;
&lt;li&gt;Spam filtering of messages: Your messaging app has a spam folder that contains all messages that have been filtered out as spam.&lt;/li&gt;
&lt;li&gt;Detecting fraudulent activities: Banks are now using machine learning to help curb tax evasion and detect fraudulent transactions. If your bank account transacts an average of $3,000 per month and then starts transacting over $50,000, that raises an eyebrow.&lt;/li&gt;
&lt;li&gt;Transport: As a more practical example, Google Maps can help you choose the best alternative route based on traffic and distance, which can reduce the time you take and the amount of fuel you need.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;Machine Learning is used to interpret patterns in your data for better decision making and prediction. Machine Learning allows you to feed your algorithm large amounts of data and it analyzes the data for better decision making. Instead of humans writing code for every action, we show the computer lots of examples, and it learns from those.&lt;/p&gt;

</description>
      <category>machinelearning</category>
      <category>ai</category>
      <category>datascience</category>
      <category>data</category>
    </item>
    <item>
      <title>Mastering SQL Optimization: A Beginner’s Guide to Faster and More Efficient Queries</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Sun, 27 Oct 2024 07:32:21 +0000</pubDate>
      <link>https://forem.com/allan-pg/mastering-sql-optimization-a-beginners-guide-to-faster-and-more-efficient-queries-32ka</link>
      <guid>https://forem.com/allan-pg/mastering-sql-optimization-a-beginners-guide-to-faster-and-more-efficient-queries-32ka</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;Has it ever crossed your mind why some SQL queries run in a flash while others take forever?  &lt;/p&gt;

&lt;p&gt;I recently came across SQL queries that do retrieve the required data but take a long time to execute. As a database admin, it is your duty to ensure SQL queries are efficient and take as little time as possible to execute. This in turn ensures faster retrieval of data while using fewer system resources.&lt;/p&gt;

&lt;h2&gt;
  
  
  What is SQL Optimization and why is it useful
&lt;/h2&gt;

&lt;p&gt;SQL optimization is the process of writing SQL queries that retrieve data quickly and efficiently. In large databases, unoptimized SQL queries often take long to execute and, in turn, use a lot of system resources. SQL optimization helps solve this problem by ensuring less execution time and less usage of system resources.&lt;/p&gt;

&lt;h2&gt;
  
  
  Techniques to ensure your SQL queries are optimized
&lt;/h2&gt;

&lt;h3&gt;
  
  
  Select only the columns you need
&lt;/h3&gt;

&lt;p&gt;When writing a SQL query, ensure you select only the columns you need in your output instead of selecting all columns.&lt;/p&gt;

&lt;p&gt;Example:&lt;br&gt;
Instead of&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;table1&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;use&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;col1&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col2&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;col3&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;table1&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Use LIMIT clause
&lt;/h3&gt;

&lt;p&gt;Use the LIMIT clause to restrict your SQL query to return only the rows you need instead of all rows.&lt;/p&gt;

&lt;p&gt;Example:&lt;br&gt;
You need to get the top 10 most paid employees &lt;/p&gt;

&lt;p&gt;Instead of&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;age&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;salary&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;employees&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;salary&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Do this&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;age&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;salary&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;employees&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;salary&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;
&lt;span class="k"&gt;LIMIT&lt;/span&gt; &lt;span class="mi"&gt;10&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Use indexes
&lt;/h3&gt;

&lt;p&gt;Use indexes in your database, especially on the tables you query with SELECT, JOIN, WHERE, and ORDER BY. However, avoid indexing every column, as this can increase storage and slow down INSERT, UPDATE, and DELETE &lt;br&gt;
operations.&lt;br&gt;&lt;br&gt;
&lt;strong&gt;To create an index:&lt;/strong&gt;&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;CREATE&lt;/span&gt; &lt;span class="k"&gt;INDEX&lt;/span&gt; &lt;span class="n"&gt;idx_customer_id&lt;/span&gt; 
&lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;orders&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;customer_id&lt;/span&gt;&lt;span class="p"&gt;);&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h3&gt;
  
  
  Avoid using functions on indexed columns
&lt;/h3&gt;

&lt;p&gt;Using functions on indexed columns in the WHERE clause will slow down the query since it prevents the database from using the index. Instead, make sure the data in your columns is stored in the format you will need when querying. For example:&lt;br&gt;
Instead of&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="n"&gt;id&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;name&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;age&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;employee&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="k"&gt;LOWER&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;f_name&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'james'&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Ensure the first name in the employee table is stored in lower case to avoid using a function on the indexed column f_name.&lt;/p&gt;

&lt;h3&gt;
  
  
  Optimize JOINS in SQL
&lt;/h3&gt;

&lt;p&gt;Another way of optimizing your SQL queries is to optimize your joins to get the fastest result possible.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Use indexes:  You should create indexes on columns used in join conditions&lt;/li&gt;
&lt;li&gt;Use INNER JOIN instead of other types of join: INNER JOIN is generally faster than OUTER JOIN because it only returns rows where there is a match in both tables.&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  Use appropriate Data Types
&lt;/h3&gt;

&lt;p&gt;Ensure you use the correct data types for columns, and avoid using large data types for columns that hold smaller data. For example, in the first name column don't use VARCHAR(255), which is the maximum, since nobody has a first name with over 200 characters; use VARCHAR(50) instead.&lt;/p&gt;

&lt;h3&gt;
  
  
  Example of Query Optimization
&lt;/h3&gt;

&lt;p&gt;Instead of&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight sql"&gt;&lt;code&gt;&lt;span class="k"&gt;SELECT&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt;
&lt;span class="k"&gt;FROM&lt;/span&gt; &lt;span class="n"&gt;employee&lt;/span&gt;
&lt;span class="k"&gt;LEFT&lt;/span&gt; &lt;span class="k"&gt;JOIN&lt;/span&gt;  &lt;span class="n"&gt;department&lt;/span&gt; &lt;span class="k"&gt;ON&lt;/span&gt; &lt;span class="n"&gt;employee&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;department_id&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt;  &lt;span class="n"&gt;department&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;department_id&lt;/span&gt;
&lt;span class="k"&gt;WHERE&lt;/span&gt; &lt;span class="k"&gt;UPPER&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;employee&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;last_name&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="s1"&gt;'SMITH'&lt;/span&gt;
&lt;span class="k"&gt;ORDER&lt;/span&gt; &lt;span class="k"&gt;BY&lt;/span&gt; &lt;span class="n"&gt;employeeage&lt;/span&gt; &lt;span class="k"&gt;DESC&lt;/span&gt;&lt;span class="p"&gt;;&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;

&lt;p&gt;&lt;strong&gt;Optimizing the SQL Query above&lt;/strong&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Specify the columns you need instead of selecting all columns&lt;/li&gt;
&lt;li&gt;Avoid using LEFT JOIN since no employee can be without a department; use INNER JOIN instead&lt;/li&gt;
&lt;li&gt;Avoid functions on indexed columns, i.e. UPPER on last_name&lt;/li&gt;
&lt;li&gt;Instead, index customer_id
&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fj9ere6sfeojepl1o0tsm.png" alt="optimized query" width="800" height="376"&gt;
&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;By following these best practices in large databases, SQL optimization can greatly improve the performance of your application.&lt;/p&gt;

</description>
      <category>sqlserver</category>
      <category>sql</category>
      <category>database</category>
      <category>datascience</category>
    </item>
    <item>
      <title>Beginner's guide to mastering web scraping in Python</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Thu, 24 Oct 2024 08:25:58 +0000</pubDate>
      <link>https://forem.com/allan-pg/getting-started-with-web-scraping-in-python-4pdg</link>
      <guid>https://forem.com/allan-pg/getting-started-with-web-scraping-in-python-4pdg</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;Recently I dived into scraping data from YouTube using the YouTube API to extract valuable information from various podcast channels in Kenya. What I discovered is that there is a lot of information on the web that can be used to get valuable insights. Upon making this discovery, I set out to learn what data scraping is and how we can extract data from the web about a product or, let's say, anything you can think of as a data analyst.&lt;/p&gt;

&lt;h2&gt;
  
  
  What is web scraping?
&lt;/h2&gt;

&lt;p&gt;Web scraping is the process of extracting valuable information from various web pages. This information can be in the form of text, pictures or links found on web pages. Web scraping is used to perform price monitoring, price intelligence, news monitoring, lead generation, and market research.&lt;/p&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fybmolep2y8cisuujtu9r.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fybmolep2y8cisuujtu9r.png" alt="uses of web scrapping" width="732" height="347"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Necessary tools and libraries needed to scrape data from the web
&lt;/h2&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fm3vnurx49u7ewseb17x6.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fm3vnurx49u7ewseb17x6.png" alt="tools and libraries needed to scrape data" width="648" height="344"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The following tools and libraries are essential to scrape data from the web:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Python&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Python is a high level programming language used to scrape data from the web since it contains powerful libraries.&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Requests&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Requests is a Python library used to make an HTTP request to a specific URL and return the response. To install requests in Jupyter notebook:&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;pip&lt;/span&gt; &lt;span class="n"&gt;install&lt;/span&gt; &lt;span class="n"&gt;requests&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;Example of making a request to URL and getting response&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;requests&lt;/span&gt;

&lt;span class="n"&gt;url&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;

&lt;span class="n"&gt;response&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;requests&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;url&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;content&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;BeautifulSoup&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;BeautifulSoup makes it easy to parse HTML and XML documents and extract data from them. To install beautifulsoup on jupyter notebook :-&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;pip&lt;/span&gt; &lt;span class="n"&gt;install&lt;/span&gt; &lt;span class="n"&gt;beautifulsoup4&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;A simple python program that extracts data using beautifulsoup&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="c1"&gt;# Import requests to send a request to the url
# Import beautifulsoup to extract data from html documents
&lt;/span&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;requests&lt;/span&gt;
&lt;span class="kn"&gt;from&lt;/span&gt; &lt;span class="n"&gt;bs4&lt;/span&gt; &lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;BeautifulSoup&lt;/span&gt;


&lt;span class="c1"&gt;# Making a GET request
&lt;/span&gt;&lt;span class="n"&gt;url&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;

&lt;span class="n"&gt;response&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;requests&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;get&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;url&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# Parsing the HTML
&lt;/span&gt;&lt;span class="n"&gt;soup&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="nc"&gt;BeautifulSoup&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;response&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;content&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;html.parser&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;# find paragraphs in the class_name of your choice 
&lt;/span&gt;&lt;span class="n"&gt;paragraphs&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;soup&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;find_all&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;p&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;,&lt;/span&gt; &lt;span class="n"&gt;class_&lt;/span&gt;&lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Class name from html code&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="c1"&gt;## Loop through each paragraph and print the text
&lt;/span&gt;&lt;span class="k"&gt;for&lt;/span&gt; &lt;span class="n"&gt;paragraph&lt;/span&gt; &lt;span class="ow"&gt;in&lt;/span&gt; &lt;span class="n"&gt;paragraphs&lt;/span&gt;&lt;span class="p"&gt;:&lt;/span&gt;
    &lt;span class="nf"&gt;print&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;paragraph&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;text&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;Scrapy&lt;/li&gt;
&lt;/ul&gt;

&lt;p&gt;Scrapy is a free and open-source web crawling framework written in Python. Unlike BeautifulSoup, which is used for parsing HTML, Scrapy handles everything from requests and parsing to data storage and handling crawling rules. Scrapy allows developers to efficiently crawl web pages and extract the desired information.&lt;br&gt;
To install scrapy in your jupyter notebook&lt;br&gt;
&lt;/p&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;pip&lt;/span&gt; &lt;span class="n"&gt;install&lt;/span&gt; &lt;span class="n"&gt;scrapy&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;h2&gt;
  
  
  Steps to Follow during web scraping
&lt;/h2&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fb7fcur3cjy1ejefdasxi.jpg" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fb7fcur3cjy1ejefdasxi.jpg" alt="Steps to Follow during web scrapping" width="600" height="400"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Identify the website to extract data from and ensure it has the data you need to perform data analysis&lt;/li&gt;
&lt;li&gt;Inspect the web page - This is done by first opening the web page, then right click and choose inspect to inspect the HTML structure of the web page and locate the data you will need.&lt;/li&gt;
&lt;li&gt;Make an HTTP request to the web page using the requests library.&lt;/li&gt;
&lt;li&gt;Using BeautifulSoup, parse the HTML content to find the data you need&lt;/li&gt;
&lt;li&gt;Once you have located the data, extract it and store it in a suitable format such as CSV or JSON&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Web scraping best practices
&lt;/h2&gt;

&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fvh1h9tvzrzs85lnwvzx9.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fvh1h9tvzrzs85lnwvzx9.png" alt="Web Scraping best practices" width="720" height="394"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Avoid making too many requests at the same time as this may overload the server making the website slow for other users&lt;/li&gt;
&lt;li&gt;Write reusable functions to make your code more readable and easy to maintain&lt;/li&gt;
&lt;li&gt;Ensure you handle and manage errors efficiently such as handling missing data&lt;/li&gt;
&lt;li&gt;When making requests, include a User-Agent header to mimic a regular browser and avoid being blocked.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;Web scraping can be used to extract a lot of information from the web. Ensure to scrape data ethically and responsibly by following the outlined practices above. Happy Scraping!&lt;/p&gt;

&lt;p&gt;Check out my recent project where I used the YouTube API to extract data on podcast channels in Kenya on &lt;a href="https://www.linkedin.com/feed/update/urn:li:activity:7254364035423330306?utm_source=share&amp;amp;utm_medium=member_desktop" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt;   &lt;/p&gt;

&lt;p&gt;Let's connect on &lt;a href="https://www.linkedin.com/in/allan-mwangi-data-analyst/" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt;&lt;/p&gt;

</description>
      <category>python</category>
      <category>webscraping</category>
      <category>datascience</category>
      <category>data</category>
    </item>
    <item>
      <title>5 Essential Excel Features Every Data Analyst Should Master</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Fri, 18 Oct 2024 06:47:02 +0000</pubDate>
      <link>https://forem.com/allan-pg/5-essential-excel-features-every-data-analyst-should-master-3igg</link>
      <guid>https://forem.com/allan-pg/5-essential-excel-features-every-data-analyst-should-master-3igg</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;&lt;strong&gt;Microsoft Excel&lt;/strong&gt; is a very powerful tool when it comes to organizing, manipulating, analyzing and visualizing data. From large businesses to small businesses, Excel remains a go-to tool that handles everything from simple calculations to advanced data modelling. Here are five key features of Excel every data analyst should know and use.&lt;/p&gt;

&lt;h2&gt;
  
  
  Pivot Tables
&lt;/h2&gt;

&lt;p&gt;Pivot tables are used to summarize large datasets to derive meaningful information from them. As a data analyst, pivot tables will help you extract insights and identify trends easily without the need of writing complex formulas.&lt;/p&gt;

&lt;h3&gt;
  
  
  Why are Pivot Tables important?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Using pivot tables, one can extract insights without the need to write complex formulas&lt;/li&gt;
&lt;li&gt;Pivot tables help save time since they analyze and organize large amounts of data quickly and efficiently.&lt;/li&gt;
&lt;li&gt;Pivot tables, when combined with pivot charts, make it easier to create interactive dashboards that can be used to identify trends.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conditional Formatting
&lt;/h2&gt;

&lt;p&gt;Conditional formatting is used to highlight cells in excel based on the specified criteria. You can apply colors, icons, or data bars based on conditions, making it easier to interpret results quickly.&lt;/p&gt;

&lt;h3&gt;
  
  
  Why is conditional formatting a useful feature in excel?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Conditional formatting makes data more visible by use of colors, data bars and icons thus making it quick to locate data&lt;/li&gt;
&lt;li&gt;By using alerts that flag values meeting certain criteria, a data analyst can be able to prevent issues such as missed deadlines&lt;/li&gt;
&lt;li&gt;Helps in decision-making by drawing attention to trends, outliers, and errors.&lt;/li&gt;
&lt;li&gt;Conditional formatting can highlight duplicates making it easier for data cleaning process&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Power Query
&lt;/h2&gt;

&lt;p&gt;Using Power Query, one can import data from different sources, for example databases, in order to clean and transform data from these sources without the need for hard coding.&lt;/p&gt;

&lt;h3&gt;
  
  
  In what ways does Power Query enhance the data analysis process?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Handles large datasets and supports importation of data from multiple sources&lt;/li&gt;
&lt;li&gt;Power query automates repetitive tasks in data cleaning making it an error free process&lt;/li&gt;
&lt;li&gt;Power query allows a data analyst to perform complex data transformation via a user friendly interface that does not require writing complex queries.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Data Validation
&lt;/h2&gt;

&lt;p&gt;Data validation helps to restrict the type of data to be inputted in a cell, print an error message in case the data entered is incorrect and create drop down list to choose from.&lt;/p&gt;

&lt;h3&gt;
  
  
  What are the key benefits of data validation?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;Ensures data integrity because it helps prevent incorrect data from being inputted in your data thus reducing time that would have been used to clean the data.&lt;/li&gt;
&lt;li&gt;Ensures that data entered follows a consistent format. For example date formats can be set to (YYYY-MM-dd)&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Look up Functions
&lt;/h2&gt;

&lt;p&gt;Lookup functions help you search for data in one table and return matching results from another. Lookup functions include:&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;HLOOKUP&lt;/li&gt;
&lt;li&gt;XLOOKUP&lt;/li&gt;
&lt;li&gt;INDEX/MATCH&lt;/li&gt;
&lt;li&gt;VLOOKUP&lt;/li&gt;
&lt;/ul&gt;

&lt;h3&gt;
  
  
  What makes Look Up functions a critical feature in data analysis?
&lt;/h3&gt;

&lt;ul&gt;
&lt;li&gt;They speed up data retrieval, since lookup functions let you locate the data you need quickly.&lt;/li&gt;
&lt;li&gt;Lookup functions enable you to join or merge data from different tables or sheets by matching related columns (like IDs or product codes).&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;Mastering these five Excel features ensures a data analyst can work efficiently and effectively with large data sets. By leveraging these tools, data analysts can derive meaningful information and help businesses make better decisions.&lt;/p&gt;

</description>
      <category>datascience</category>
      <category>data</category>
      <category>analytics</category>
      <category>excel</category>
    </item>
    <item>
      <title>Handling Outliers 101: Why the IQR Method is Your Go-To Tool</title>
      <dc:creator>allan-pg</dc:creator>
      <pubDate>Thu, 10 Oct 2024 20:18:00 +0000</pubDate>
      <link>https://forem.com/allan-pg/handling-outliers-in-python-iqr-method-5afh</link>
      <guid>https://forem.com/allan-pg/handling-outliers-in-python-iqr-method-5afh</guid>
      <description>&lt;h2&gt;
  
  
  Introduction
&lt;/h2&gt;

&lt;p&gt;Before uncovering any insights from real-world data, it is important to scrutinize your data to ensure that it is consistent and free from errors. However, data can contain errors, and some values may differ markedly from the rest; these values are known as outliers. Outliers negatively impact data analysis, leading to wrong insights, which in turn lead to poor decision making by stakeholders. Therefore, dealing with outliers is a critical step in the data preprocessing stage in data science. In this article, we will assess the IQR method of handling outliers.&lt;/p&gt;

&lt;h2&gt;
  
  
  Outliers
&lt;/h2&gt;

&lt;p&gt;Outliers are data points that differ significantly from the majority of the data points in a dataset. They are values that fall outside the expected or usual range of values for a particular variable. Outliers occur for various reasons, for example, errors during data entry or sampling errors. In machine learning, outliers can distort what your models learn, causing them to make inaccurate predictions.&lt;/p&gt;

&lt;h2&gt;
  
  
  Detecting outliers in a dataset using Jupyter notebook
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Import python libraries
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;pandas&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;matplotlib.pyplot&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;plt&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;seaborn&lt;/span&gt; &lt;span class="k"&gt;as&lt;/span&gt; &lt;span class="n"&gt;sns&lt;/span&gt;
&lt;span class="kn"&gt;import&lt;/span&gt; &lt;span class="n"&gt;warnings&lt;/span&gt;
&lt;span class="n"&gt;warnings&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;filterwarnings&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;ignore&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;plt&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="n"&gt;style&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;use&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;ggplot&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;Load your csv file using pandas
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df_house_price&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;pd&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;read_csv&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sa"&gt;r&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;C:\Users\Admin\Desktop\csv files\housePrice.csv&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;Check the first five rows of the house prices data set to get a glimpse of your dataframe
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;head&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fk6ghpwypjwc6clibuw91.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fk6ghpwypjwc6clibuw91.png" alt="First Five rows of your dataframe" width="736" height="182"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Check for outliers in the price column by use of a box plot
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;sns&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;boxplot&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
&lt;span class="n"&gt;plt&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;title&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Box plot showing outliers in prices&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;plt&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fnzv6jtp3hgd6g11xpork.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fnzv6jtp3hgd6g11xpork.png" alt="Box plot" width="709" height="455"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;From the box plot visualization the price column has outlier values &lt;/li&gt;
&lt;li&gt;Now we have to come up with ways to handle these outlier values to ensure better decision making and ensure machine learning models make the correct prediction&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  IQR Method of handling outlier values
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;IQR stands for interquartile range, which measures the spread of the middle half of your data. It is the range of the middle 50% of your sample.&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Steps for removing outliers using interquartile range
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Calculate the first quartile (Q1), the value below which 25% of the data falls, and the third quartile (Q3), the value below which 75% of the data falls.
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;Q1&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;quantile&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mf"&gt;0.25&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;Q3&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;].&lt;/span&gt;&lt;span class="nf"&gt;quantile&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="mf"&gt;0.75&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;Compute the interquartile range
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;IQR&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;Q3&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="n"&gt;Q1&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;ul&gt;
&lt;li&gt;Determine the outlier boundaries.
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;lower_bound&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;Q1&lt;/span&gt; &lt;span class="o"&gt;-&lt;/span&gt; &lt;span class="mf"&gt;1.5&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="n"&gt;IQR&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0qog3qifilqx4rex0l95.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0qog3qifilqx4rex0l95.png" alt="Lower Bound" width="526" height="79"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Lower bound means any value below -5454375000.0 is an outlier
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;upper_bound&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;Q3&lt;/span&gt; &lt;span class="o"&gt;+&lt;/span&gt; &lt;span class="mf"&gt;1.5&lt;/span&gt; &lt;span class="o"&gt;*&lt;/span&gt; &lt;span class="n"&gt;IQR&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0v3sqtihcvnnumo8p3ds.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0v3sqtihcvnnumo8p3ds.png" alt="upper bound" width="593" height="66"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;&lt;p&gt;Upper bound means any value above 12872625000.0 is an outlier&lt;/p&gt;&lt;/li&gt;
&lt;li&gt;&lt;p&gt;Remove outlier values in the price column&lt;br&gt;
&lt;/p&gt;&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;filt&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;&amp;gt;=&lt;/span&gt; &lt;span class="n"&gt;lower_bound&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt; &lt;span class="o"&gt;&amp;amp;&lt;/span&gt; &lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt; &lt;span class="o"&gt;&amp;lt;=&lt;/span&gt; &lt;span class="n"&gt;upper_bound&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;

&lt;span class="n"&gt;df&lt;/span&gt; &lt;span class="o"&gt;=&lt;/span&gt; &lt;span class="n"&gt;df_house_price&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="n"&gt;filt&lt;/span&gt;&lt;span class="p"&gt;]&lt;/span&gt;
&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;head&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fr14at62dbihhhcmlfs4t.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2Fr14at62dbihhhcmlfs4t.png" alt="clean dataframe" width="708" height="161"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;ul&gt;
&lt;li&gt;Box plot after removing outliers
&lt;/li&gt;
&lt;/ul&gt;

&lt;div class="highlight js-code-highlight"&gt;
&lt;pre class="highlight python"&gt;&lt;code&gt;&lt;span class="n"&gt;sns&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;boxplot&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="n"&gt;df&lt;/span&gt;&lt;span class="p"&gt;[&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Price&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;])&lt;/span&gt;
&lt;span class="n"&gt;plt&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;title&lt;/span&gt;&lt;span class="p"&gt;(&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="s"&gt;Box plot after removing outliers&lt;/span&gt;&lt;span class="sh"&gt;'&lt;/span&gt;&lt;span class="p"&gt;)&lt;/span&gt;
&lt;span class="n"&gt;plt&lt;/span&gt;&lt;span class="p"&gt;.&lt;/span&gt;&lt;span class="nf"&gt;show&lt;/span&gt;&lt;span class="p"&gt;()&lt;/span&gt;
&lt;/code&gt;&lt;/pre&gt;

&lt;/div&gt;



&lt;p&gt;&lt;a href="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0dyz0nnudp0ive5uzwic.png" class="article-body-image-wrapper"&gt;&lt;img src="https://media2.dev.to/dynamic/image/width=800%2Cheight=%2Cfit=scale-down%2Cgravity=auto%2Cformat=auto/https%3A%2F%2Fdev-to-uploads.s3.amazonaws.com%2Fuploads%2Farticles%2F0dyz0nnudp0ive5uzwic.png" alt="Box plot without outliers" width="778" height="407"&gt;&lt;/a&gt;&lt;/p&gt;

&lt;h2&gt;
  
  
  Different methods of handling outlier values
&lt;/h2&gt;

&lt;ul&gt;
&lt;li&gt;Z-Score method&lt;/li&gt;
&lt;li&gt;Percentile Capping (Winsorizing)&lt;/li&gt;
&lt;li&gt;Trimming (Truncation)&lt;/li&gt;
&lt;li&gt;Imputation&lt;/li&gt;
&lt;li&gt;Clustering-Based Methods, e.g. DBSCAN&lt;/li&gt;
&lt;/ul&gt;

&lt;h2&gt;
  
  
  Conclusion
&lt;/h2&gt;

&lt;p&gt;The IQR method is simple, robust to outliers, and does not depend on the normality assumption. Its disadvantages are that it can only handle univariate data and that it can remove valid data points if the data is skewed or has heavy tails.&lt;/p&gt;

&lt;p&gt;Thank you for reading! &lt;br&gt;
Follow me on &lt;a href="https://www.linkedin.com/in/allan-mwangi-data-analyst/" rel="noopener noreferrer"&gt;LinkedIn&lt;/a&gt; and on &lt;a href="https://github.com/allan-pg" rel="noopener noreferrer"&gt;GitHub&lt;/a&gt; for more.&lt;/p&gt;

</description>
      <category>python</category>
      <category>datascience</category>
      <category>dataengineering</category>
      <category>data</category>
    </item>
  </channel>
</rss>
