Since Wikipedia contains a plethora of information about many different topics, it was an ideal source for gathering comprehensive text data regarding electric vehicles.

Web Scraping from Wikipedia

Step 1

Importing the essential Python libraries:

Show the code
import wikipedia
import nltk
import string 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
Show the code
# Download the NLTK resources used below; each call is a no-op if the
# package is already up to date locally.
nltk.download('vader_lexicon')  # lexicon for SentimentIntensityAnalyzer
nltk.download('punkt')          # sentence/word tokenizer models
nltk.download('wordnet')        # WordNet data for the lemmatizer
nltk.download('omw-1.4')        # Open Multilingual WordNet (lemmatizer support)
# Bug fix: the stopword corpus is used later (nltk.corpus.stopwords) but was
# never downloaded, which fails on a fresh machine.
nltk.download('stopwords')
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/isfarbaset/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
True

Step 2

Setting the parameters for the text processing task and initializing the tools for NLP tasks:

Show the code
# PARAMETERS 
label_list=['electric vehicle', 'gasoline-powered']  # topics to scrape and label
max_num_pages=25          # max Wikipedia search results per label
sentence_per_chunk=5      # number of sentences grouped into one text chunk
min_sentence_length=20    # sentences shorter than this (in characters) are skipped

# GET STOPWORDS
# from nltk.corpus import stopwords
stop_words=nltk.corpus.stopwords.words('english')

# INITIALIZE STEMMER + LEMMATIZER + SENTIMENT-INTENSITY ANALYZER
sia = SentimentIntensityAnalyzer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

Step 3

The following function takes a string input, filters out any unwanted characters, and performs tasks such as lemmatization to output a more readable version of the raw, unprocessed text.

Show the code
def clean_string(text):
    """Normalize raw text for downstream vectorization.

    Lower-cases the input, replaces every character outside
    [a-z0-9 ] with a space, removes English stopwords, lemmatizes
    each remaining token, and drops single-character tokens.

    Parameters
    ----------
    text : str
        Raw, unprocessed text.

    Returns
    -------
    str
        Space-separated cleaned tokens.
    """
    # FILTER OUT UNWANTED CHARACTERS: anything outside the keep-set becomes
    # a space so adjacent words stay separated.
    keep = " abcdefghijklmnopqrstuvwxyz0123456789"
    text = "".join(c if c in keep else " " for c in text.lower())

    # Performance fix: build the stopword set ONCE per call. The original
    # called nltk.corpus.stopwords.words('english') for every single token,
    # re-materializing the full list each time; set membership is also O(1).
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # FILTER OUT STOPWORDS, lemmatize, and keep only multi-character tokens.
    # (The original also special-cased punctuation tokens here, but that
    # branch was unreachable: punctuation was already replaced by spaces
    # above, and all such tokens are single characters anyway.)
    words = []
    for word in nltk.tokenize.word_tokenize(text):
        if word in stop_words:
            continue
        word = lemmatizer.lemmatize(word)
        if len(word) > 1:
            words.append(word.lower())
    return " ".join(words).strip()

Step 4

In the following code snippet, Wikipedia is scraped for text data containing the specific keywords, and sentiment scores are assigned to the text chunks to facilitate sentiment analysis later on.

Show the code
# INITIALIZE
corpus=[]  # list of strings (input variables X)
targets=[] # list of targets (labels or response variables Y)

#--------------------------
# LOOP OVER TOPICS 
#--------------------------
for label in label_list:

    # SEARCH FOR RELEVANT PAGES
    titles=wikipedia.search(label,results=max_num_pages)
    print("Pages for label =",label,":",titles)

    # LOOP OVER WIKI-PAGES
    for title in titles:
        try:
            print(" ",title)
            # NOTE(review): auto_suggest=True may silently fetch a different
            # article than `title` — confirm this is intended.
            wiki_page = wikipedia.page(title, auto_suggest=True)

            # LOOP OVER SECTIONS IN ARTICLE AND GET PAGE TEXT
            for section in wiki_page.sections:
                text=wiki_page.section(section)

                # Bug fix: some sections return None (e.g. empty headers).
                # Previously this raised inside sent_tokenize and the bare
                # except aborted ALL remaining sections of the page.
                if not text:
                    continue

                # BREAK INTO SENTENCES
                sentences=nltk.tokenize.sent_tokenize(text)
                counter=0
                text_chunk=''

                # LOOP OVER SENTENCES: accumulate `sentence_per_chunk` long
                # sentences, then clean/score/store the completed chunk.
                for sentence in sentences:
                    if len(sentence)>min_sentence_length:
                        if(counter%sentence_per_chunk==0 and counter!=0):
                            # CLEAN STRING
                            text_chunk=clean_string(text_chunk)

                            # REMOVE LABEL IF IN STRING (MAKES IT TOO EASY)
                            text_chunk=text_chunk.replace(label,"")

                            # REMOVE ANY DOUBLE SPACES
                            text_chunk=' '.join(text_chunk.split()).strip()

                            # UPDATE CORPUS
                            corpus.append(text_chunk)

                            # UPDATE TARGETS: (label, VADER compound score)
                            score=sia.polarity_scores(text_chunk)
                            target=[label,score['compound']]
                            targets.append(target)

                            # RESET CHUNK FOR NEXT ITERATION
                            text_chunk=sentence
                        else:
                            text_chunk+=sentence
                        counter+=1

        # Bug fix: narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the loop uninterruptible.
        except Exception:
            print("WARNING: SOMETHING WENT WRONG:", title)
Pages for label = electric vehicle : ['Electric vehicle', 'History of the electric vehicle', 'Battery electric vehicle', 'Electric vehicle battery', 'Hybrid electric vehicle', 'Electric car use by country', 'Plug-in electric vehicle', 'List of production battery electric vehicles', 'Neighborhood Electric Vehicle', 'Hybrid vehicle drivetrain', 'Aptera (solar electric vehicle)', 'Electric car', 'Capacitor electric vehicle', 'Electric vehicle conversion', 'Hybrid vehicle', 'Plug-in hybrid', 'Charging station', 'Citroën Ami (electric vehicle)', 'Grumman LLV', 'Fuel cell vehicle', 'London Electric Vehicle Company', 'Electric vehicle industry in China', 'Electric Vehicle Company', 'Tesla, Inc.', 'Plug-in electric vehicles in China']
     Electric vehicle
     History of the electric vehicle
     Battery electric vehicle
     Electric vehicle battery
     Hybrid electric vehicle
     Electric car use by country
     Plug-in electric vehicle
     List of production battery electric vehicles
     Neighborhood Electric Vehicle
     Hybrid vehicle drivetrain
     Aptera (solar electric vehicle)
     Electric car
     Capacitor electric vehicle
     Electric vehicle conversion
     Hybrid vehicle
     Plug-in hybrid
     Charging station
WARNING: SOMETHING WENT WRONG: Charging station
     Citroën Ami (electric vehicle)
     Grumman LLV
     Fuel cell vehicle
WARNING: SOMETHING WENT WRONG: Fuel cell vehicle
     London Electric Vehicle Company
     Electric vehicle industry in China
     Electric Vehicle Company
     Tesla, Inc.
     Plug-in electric vehicles in China
Pages for label = gasoline-powered : ['Petrol engine', 'Tractor', 'Charles Duryea', 'Gasoline', 'History of the automobile', 'Waterloo Gasoline Engine Company', 'Chainsaw', 'Lawn mower', 'Catalytic converter', 'ZF S6-37 transmission', 'Mercedes-Benz MB517 engine', 'Ford Duratec engine', 'Motorized bicycle', 'Chevrolet small-block engine', 'Mazda MZR engine', 'Mercedes-Benz MB507 engine', 'Houseboat', 'Hydrogen internal combustion engine vehicle', 'Trolling motor', 'Radio-controlled car', 'Maxus G90', 'Natural gas vehicle', 'Two-stroke oil', 'String trimmer', 'Hydrogen vehicle']
     Petrol engine
     Tractor
WARNING: SOMETHING WENT WRONG: Tractor
     Charles Duryea
     Gasoline
     History of the automobile
     Waterloo Gasoline Engine Company
     Chainsaw
     Lawn mower
     Catalytic converter
WARNING: SOMETHING WENT WRONG: Catalytic converter
     ZF S6-37 transmission
     Mercedes-Benz MB517 engine
     Ford Duratec engine
     Motorized bicycle
     Chevrolet small-block engine
     Mazda MZR engine
     Mercedes-Benz MB507 engine
     Houseboat
     Hydrogen internal combustion engine vehicle
     Trolling motor
     Radio-controlled car
     Maxus G90
     Natural gas vehicle
     Two-stroke oil
     String trimmer
     Hydrogen vehicle

Step 5

The collected text data is shaped into a dataframe and stored in a CSV file:

Show the code
# SANITY CHECKS AND PRINT TO FILE
print("number of text chunks = ",len(corpus))
print("number of targets = ",len(targets))

# Idiom fix: build the DataFrame with named columns directly, instead of an
# index loop into an unnamed frame followed by a rename.
df = pd.DataFrame(
    {
        "text": corpus,
        "label": [t[0] for t in targets],
        "sentiment": [t[1] for t in targets],
    }
)
print(df)
df.to_csv('wiki-crawl-results.csv',index=False)
number of text chunks =  723
number of targets =  723
                                                  text             label  \
0    electric motive power started 1827 hungarian p...  electric vehicle   
1    first mass produced appeared america early 190...  electric vehicle   
2    20th century uk world largest user electric ro...  electric vehicle   
3    1900 28 percent car road electric ev popular e...  electric vehicle   
4    seldom marketed woman luxury car may stigma am...  electric vehicle   
..                                                 ...               ...   
718  lifetime hydrogen vehicle emit carbon gasoline...  gasoline-powered   
719  convert hydrogen back electricity fuel cell an...  gasoline-powered   
720  2019 video real engineering noted notwithstand...  gasoline-powered   
721  maybe hydrogen fuel cell car come technology n...  gasoline-powered   
722  internal combustion engine based compressed na...  gasoline-powered   

     sentiment  
0      -0.7506  
1       0.9201  
2       0.7096  
3       0.9169  
4       0.9231  
..         ...  
718     0.9100  
719     0.9136  
720     0.7964  
721     0.9260  
722     0.9118  

[723 rows x 3 columns]

Resources:

  • https://drive.google.com/drive/folders/1O62qRfigp1T6bXVOpMg2Z_1CHn8tXh-d