from bs4 import BeautifulSoup
import re
import requests
import pandas as pd

# TripAdvisor review page to download
url = 'https://www.tripadvisor.com.my/Restaurant_Review-g3533821-d8806165-Reviews-Good_View_Fresh_Seafood_Restaurant-Tawau_Tawau_Division_Sabah.html#REVIEWS'

# Send a browser-like User-Agent header with the request
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"
}

# Use a Session so cookies are handled across requests
session = requests.Session()

# requests has no default timeout; without one a stalled connection would
# block the script indefinitely, so give up after 30 seconds instead.
response = session.get(url, headers=headers, timeout=30)
print(f'Response status code: {response.status_code}')

Response status code: 200


# Every later step reads `content` and `reviews`, so stop with a clear error
# instead of printing a message and failing later with a NameError.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage (status code {response.status_code})"
    )

# Parse HTML structure of webpage
content = BeautifulSoup(response.content, 'html.parser')

# Find all individual reviews
reviews_container = content.find('div', class_='listContainer')
if reviews_container is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than letting the next line raise AttributeError on None.
    raise RuntimeError("Review list container ('div.listContainer') not found in the page")
reviews = reviews_container.find_all('div', class_='review-container')


# The restaurant name is the text of the first <h1> element on the page
headings = content.select("h1")
first_heading = headings[0]
name = first_heading.text.strip()
print(name)

# Parallel lists: entry i of each list belongs to the same review
review_texts = []
review_ratings = []

for review in reviews:
    # Review body text
    text_node = review.find('p', class_='partial_entry')
    review_texts.append(text_node.text.strip())

    # The second CSS class of the rating bubble ends in a number after the
    # last underscore; dividing that number by 10 gives the rating value.
    bubble_classes = review.find('span', class_='ui_bubble_rating')['class']
    bubble_score = bubble_classes[1].rsplit('_', 1)[-1]
    review_ratings.append(int(bubble_score) / 10)

# One row per review
columns = {'Review': review_texts, 'Rating': review_ratings}
df = pd.DataFrame(columns)

# Persist the scraped data as CSV (without the index column), then preview it
df.to_csv('restaurant_review.csv', index=False)
df.head()

Good View Fresh Seafood Restaurant


# Punctuations to be removed (also includes carriage return, newline and space).
# The backslash is written as "\\" because "\," is not a valid escape sequence
# and triggers a warning on recent Python versions; the string value is unchanged.
punctuations = '''\r\n!()-[]{};:'"\\, <>./?@#$%^&*_~'''


# #Option 1: Get English stop words from nltk
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

#Option 2: Get stopwords from stopwords_url (plain text, one stopword per line)
stopwords_url = 'https://raw.githubusercontent.com/theleadio/datascience_demo/master/stopwords.txt'

# timeout: requests would otherwise wait indefinitely on a stalled connection
response = requests.get(stopwords_url, timeout=30)
# Fail loudly on an HTTP error so an error page is never used as the stopword list
response.raise_for_status()

# A set gives constant-time membership tests when filtering tokens
stop_words = set(response.text.splitlines())


# Read back the CSV written by the scraping step; use the same relative path
# it was saved under so this works regardless of the project folder's name.
df = pd.read_csv('restaurant_review.csv')

# Translation table mapping every punctuation character to a single space,
# so each review is cleaned in one pass instead of one replace() per character.
punctuation_to_space = str.maketrans(punctuations, ' ' * len(punctuations))

# Cleaned review strings, in the same order as the rows of df
review_filtered = []

# fillna(''): an empty review is read back from CSV as NaN (a float), which
# cannot be processed as text; treat it as an empty string instead.
for review in df['Review'].fillna(''):

    # Remove punctuations in raw 'Review', then normalise case
    review = review.translate(punctuation_to_space).lower()

    # Remove stopwords in raw 'Review'
    tokens_filtered = [t for t in review.split() if t not in stop_words]
    review_filtered.append(' '.join(tokens_filtered))


import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the 'vader_lexicon' NLTK data package (reports "already up-to-date" when present)
nltk.download('vader_lexicon')

#Create analyzer for sentiment analysis; this one instance scores every review
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gideon/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Score every filtered review and keep only the 'compound' value of each result
sentiment = [
    analyzer.polarity_scores(filtered_text)['compound']
    for filtered_text in review_filtered
]

# Add the scores to the dataframe as a new 'Sentiment' column and display it
df['Sentiment'] = sentiment
df


#Study sentiment score relevancy to 'Review':
#display the full 'Review' text stored at row label 7 so it can be read
#alongside that row's 'Rating' and 'Sentiment' values
df.loc[7,'Review']

'Steam ginger fish mixed with butter, feel weird. Curry chicken still acceptable, and the vegetables found a hair in there..'


# Compare 'Sentiment' against 'Rating': scatter plot with a fitted regression line
import seaborn as sns
import matplotlib.pyplot as plt

x = df['Rating']
y = df['Sentiment']

# regplot draws on the current axes and returns them, so label that object directly
ax = sns.regplot(x=x, y=y)
ax.set_xlabel('Rating')
ax.set_ylabel('Review sentiment')
ax.set_title('Review Sentiment vs Rating of Restaurant')

plt.show()


# Summarise the restaurant: mean of the 'Rating' column and mean of the 'Sentiment' column
average_rating = df['Rating'].mean()
average_sentiment = df['Sentiment'].mean()

# Each value is wrapped in a one-element list
restaurant = dict(
    Name=[name],
    Rating=[average_rating],
    Review_sentiment=[average_sentiment],
)

print(restaurant)

{'Name': ['Good View Fresh Seafood Restaurant'], 'Rating': [3.066666666666667], 'Review_sentiment': [0.21479333333333336]}

	Review	Rating	Sentiment
0	Received recommendation from our taxi driver f...	3.0	0.6808
1	Our group of friends opted for dinner at Good...	3.0	0.9153
2	Nice place near the seaside, seafood very fres...	4.0	0.7506
3	Mostly tourists will come here to eat seafood ...	4.0	0.6369
4	Well, Despite its simple renovation, and hawke...	4.0	0.8689
5	This place is sucked, the seafood is pricy. We...	2.0	-0.1027
6	Too expensive on the food, taste average and s...	1.0	-0.5994
7	Steam ginger fish mixed with butter, feel weir...	1.0	0.1531
8	Ordered two single dished rice i.e buttered ch...	4.0	-0.5423
9	The seafood is fresh and tasty. However, price...	4.0	0.6369
10	Very bad working altitude from\nMost of their ...	3.0	-0.8271
11	This is a hawker style seafood restaurant. The...	4.0	0.5574
12	The restaurant is a food-hall style in the ope...	4.0	0.0000
13	We got the impression from previous reviews th...	4.0	0.7184
14	We were 18 person.\nTaste is good...also kind....	1.0	-0.6249

Restaurant Review Analysis¶

Restaurant Review Data¶

Data source:¶

Analysis on Unstructured Online Review Data¶

1. Scrape data from webpage¶

Check request status on URL¶

Parse data using Beautiful Soup¶

Extract restaurant name, reviews and ratings¶

Remark:¶

2. Perform natural language processing on Review data¶

Lists of punctuations and stopwords¶

Remove punctuations and stopwords in Review data¶

Remark:¶

3. Analyze sentiment of Review data¶

Analyze sentiment using nltk.sentiment¶

Remark:¶

Study sentiment score relevancy¶

Remark:¶

Save Restaurant Review Analysis for Future Reference¶

Remark:¶