import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string


from wordcloud import WordCloud,STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import  f1_score, confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import plot_confusion_matrix, classification_report,accuracy_score


fake_data = pd.read_csv('Fake.csv')

true_data = pd.read_csv('True.csv')

fake_data.head()


true_data.head()


fake_data['target'] = 0
true_data['target'] = 1


fake_data.head()


news = pd.concat([true_data,fake_data], axis=0)


news


news['target'].value_counts()

0    23481
1    21417
Name: target, dtype: int64


sns.countplot(data=news, x='target')

<AxesSubplot:xlabel='target', ylabel='count'>


news['subject'].value_counts().plot(kind='barh')

<AxesSubplot:>


news['date']

0        December 31, 2017 
1        December 29, 2017 
2        December 31, 2017 
3        December 30, 2017 
4        December 29, 2017 
                ...        
23476      January 16, 2016
23477      January 16, 2016
23478      January 15, 2016
23479      January 14, 2016
23480      January 12, 2016
Name: date, Length: 44898, dtype: object


news['content']= news['title'] + " " + news['text']


news.head()


g = sns.catplot(x="subject", col="target",
                data=news, kind="count",
                height=4, aspect=2)

g.set_xticklabels(rotation=45)

<seaborn.axisgrid.FacetGrid at 0x2700bfb8640>


news=news[~news.date.str.contains("http")]
news=news[~news.date.str.contains("HOST")]


news['date']= pd.to_datetime(news['date'])


news.head()


del news['title']
del news['text']


news.head()


# Rearranging the data in the dataset

news= news[['date', 'subject','content', 'target']]


news.head()


news.isnull().sum()

date       0
subject    0
content    0
target     0
dtype: int64


plt.figure(figsize=(12,10))
news.groupby(['date'])['target'].count().plot(kind='line')

<AxesSubplot:xlabel='date'>


plt.figure(figsize=(12,10))
sns.lineplot(x=news.date, y= news.target)

<AxesSubplot:xlabel='date', ylabel='target'>


#Word cloud for real news
cloud = WordCloud(max_words = 1800, 
                  width = 1600, 
                  height = 800,
                  stopwords = STOPWORDS, 
                  background_color = "white").generate(" ".join(news[news['target'] == 1].content))
plt.figure(figsize=(15, 9))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


#Word cloud for fake news
cloud = WordCloud(max_words = 1800, 
                  width = 1600, 
                  height = 800,
                  stopwords = STOPWORDS, 
                  background_color = "white").generate(" ".join(news[news['target'] == 0].content))
plt.figure(figsize=(15, 9))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


X = news['content']

y = news['target']


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)


t_vector = TfidfVectorizer(stop_words = 'english')


t_vector.fit(X_train)

TfidfVectorizer(stop_words='english')


train_results = t_vector.transform(X_train)

test_results = t_vector.transform(X_test)


multi_nomial = MultinomialNB()
multi_nomial.fit(train_results,y_train)

MultinomialNB()


pred = multi_nomial.predict(test_results)
print(classification_report(y_test,pred))
plot_confusion_matrix(multi_nomial,test_results, y_test)

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      6977
           1       0.94      0.92      0.93      6490

    accuracy                           0.94     13467
   macro avg       0.94      0.93      0.94     13467
weighted avg       0.94      0.94      0.94     13467

C:\Users\Pce\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x270128fe970>


train_score = 0
test_score = 0
test_recall = 0
test_auc = 0

train_score = multi_nomial.score(train_results, y_train)
test_score = multi_nomial.score(test_results, y_test)
y_pred_nb = multi_nomial.predict(test_results)

test_recall = recall_score(y_test, y_pred_nb)
nb_fpr, nb_tpr, thresholds = roc_curve(y_test, y_pred_nb)
test_auc = auc(nb_fpr, nb_tpr)
test_f1_score = f1_score(y_test,y_pred_nb)
test_precision_score = precision_score(y_test,y_pred_nb)

#implement AUC Curve
y_pred_proba = multi_nomial.predict_proba(test_results)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
print("Test f1 score", test_f1_score)
print("Test precision score", test_precision_score)

Train accuracy  0.9455777982877693
Test accuracy  0.9353976386723101
Test recall 0.9198767334360555
Test AUC 0.9348559530731947
Test f1 score 0.9320843091334895
Test precision score 0.944620253164557


rfc = RandomForestClassifier(n_estimators=100,random_state=101)

rfc.fit(train_results, y_train)

RandomForestClassifier(random_state=101)


pred_rf = rfc.predict(test_results)
print(classification_report(y_test,pred_rf))
plot_confusion_matrix(rfc,test_results, y_test)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6977
           1       0.99      0.99      0.99      6490

    accuracy                           0.99     13467
   macro avg       0.99      0.99      0.99     13467
weighted avg       0.99      0.99      0.99     13467

C:\Users\Pce\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x270246dc760>


train_score = 0
test_score = 0
test_recall = 0
test_auc = 0

train_score = rfc.score(train_results, y_train)
test_score = rfc.score(test_results, y_test)
y_pred_rf = rfc.predict(test_results)

test_recall = recall_score(y_test, y_pred_rf)
rf_fpr, rf_tpr, thresholds = roc_curve(y_test, y_pred_rf)
test_auc = auc(rf_fpr, rf_tpr)
test_f1_score = f1_score(y_test,y_pred_rf)
test_precision_score = precision_score(y_test,y_pred_rf)

#implement AUC Curve
y_pred_proba = rfc.predict_proba(test_results)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
print("Test f1 score", test_f1_score)
print("Test precision score", test_precision_score)

Train accuracy  1.0
Test accuracy  0.9915348629984406
Test recall 0.9896764252696456
Test AUC 0.9914700028025166
Test f1 score 0.9912037037037037
Test precision score 0.9927357032457497


xgb = XGBClassifier(random_state=101)
xgb.fit(train_results, y_train)

C:\Users\Pce\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)

[23:53:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=101,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)


pred_xgb = rfc.predict(test_results)
print(classification_report(y_test,pred_xgb))
plot_confusion_matrix(xgb,test_results, y_test)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6977
           1       0.99      0.99      0.99      6490

    accuracy                           0.99     13467
   macro avg       0.99      0.99      0.99     13467
weighted avg       0.99      0.99      0.99     13467

C:\Users\Pce\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
  warnings.warn(msg, category=FutureWarning)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x270246dc850>


train_score = 0
test_score = 0
test_recall = 0
test_auc = 0

train_score = xgb.score(train_results, y_train)
test_score = xgb.score(test_results, y_test)
y_pred_xgb = xgb.predict(test_results)

test_recall = recall_score(y_test, y_pred_xgb)
xgb_fpr, xgb_tpr, thresholds = roc_curve(y_test, y_pred_xgb)
test_auc = auc(xgb_fpr, xgb_tpr)
test_f1_score = f1_score(y_test,y_pred_xgb)
test_precision_score = precision_score(y_test,y_pred_xgb)


#implement AUC Curve
y_pred_proba = xgb.predict_proba(test_results)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
print("Test f1 score", test_f1_score)
print("Test precision score", test_precision_score)

Train accuracy  0.9999681741510454
Test accuracy  0.9972525432538799
Test recall 0.9979969183359014
Test AUC 0.997278522232305
Test f1 score 0.9971518743745671
Test precision score 0.9963082602676512


from sklearn.model_selection import GridSearchCV

rfc_clf = RandomForestClassifier(n_jobs=-1, random_state=42) 

param_grid = { 
    'n_estimators': [500, 700, 900],
    'min_samples_split': [2,4,6,8,10]
}

gs = GridSearchCV ( estimator = rfc_clf,
                   param_grid = param_grid,
                   scoring = 'accuracy',
                   cv = 5,
                   refit = True,
                   n_jobs = -1
                   )

gs = gs.fit ( train_results, y_train )

print ( 'Parameter setting that gave the best results on the hold out data:', gs.best_params_ )

print ( 'Mean cross-validated score of the best_estimator: %.3f' %gs.best_score_ )

gs = gs.best_estimator_

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt


gs.fit (train_results, y_train)
y_pred_rfc_clf = gs.predict (test_results)


train_score = 0
test_score = 0
test_recall = 0
test_auc = 0

train_score = gs.score(train_results, y_train)
test_score = gs.score(test_results, y_test)
y_pred_rfc_clf = gs.predict(test_results)

test_recall = recall_score(y_test, y_pred_rfc_clf)
rfc_fpr, rfc_tpr, thresholds = roc_curve(y_test, y_pred_rfc_clf)
test_auc = auc(rfc_fpr, rfc_tpr)
test_f1_score = f1_score(y_test,y_pred_rfc_clf)
test_precision_score = precision_score(y_test,y_pred_rfc_clf)

#implement AUC Curve
y_pred_proba = xgb.predict_proba(test_results)[::,1]
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
print("Test f1 score", test_f1_score)
print("Test precision score", test_precision_score)

	title	text	subject	date
0	Donald Trump Sends Out Embarrassing New Year’...	Donald Trump just couldn t wish all Americans ...	News	December 31, 2017
1	Drunk Bragging Trump Staffer Started Russian ...	House Intelligence Committee Chairman Devin Nu...	News	December 31, 2017
2	Sheriff David Clarke Becomes An Internet Joke...	On Friday, it was revealed that former Milwauk...	News	December 30, 2017
3	Trump Is So Obsessed He Even Has Obama’s Name...	On Christmas day, Donald Trump announced that ...	News	December 29, 2017
4	Pope Francis Just Called Out Donald Trump Dur...	Pope Francis used his annual Christmas Day mes...	News	December 25, 2017

	title	text	subject	date
0	As U.S. budget fight looms, Republicans flip t...	WASHINGTON (Reuters) - The head of a conservat...	politicsNews	December 31, 2017
1	U.S. military to accept transgender recruits o...	WASHINGTON (Reuters) - Transgender people will...	politicsNews	December 29, 2017
2	Senior U.S. Republican senator: 'Let Mr. Muell...	WASHINGTON (Reuters) - The special counsel inv...	politicsNews	December 31, 2017
3	FBI Russia probe helped by Australian diplomat...	WASHINGTON (Reuters) - Trump campaign adviser ...	politicsNews	December 30, 2017
4	Trump wants Postal Service to charge 'much mor...	SEATTLE/WASHINGTON (Reuters) - President Donal...	politicsNews	December 29, 2017

	title	text	subject	date
0	Donald Trump Sends Out Embarrassing New Year’...	Donald Trump just couldn t wish all Americans ...	News	December 31, 2017
1	Drunk Bragging Trump Staffer Started Russian ...	House Intelligence Committee Chairman Devin Nu...	News	December 31, 2017
2	Sheriff David Clarke Becomes An Internet Joke...	On Friday, it was revealed that former Milwauk...	News	December 30, 2017
3	Trump Is So Obsessed He Even Has Obama’s Name...	On Christmas day, Donald Trump announced that ...	News	December 29, 2017
4	Pope Francis Just Called Out Donald Trump Dur...	Pope Francis used his annual Christmas Day mes...	News	December 25, 2017

	title	text	subject	date	target
0	As U.S. budget fight looms, Republicans flip t...	WASHINGTON (Reuters) - The head of a conservat...	politicsNews	December 31, 2017	1
1	U.S. military to accept transgender recruits o...	WASHINGTON (Reuters) - Transgender people will...	politicsNews	December 29, 2017	1
2	Senior U.S. Republican senator: 'Let Mr. Muell...	WASHINGTON (Reuters) - The special counsel inv...	politicsNews	December 31, 2017	1
3	FBI Russia probe helped by Australian diplomat...	WASHINGTON (Reuters) - Trump campaign adviser ...	politicsNews	December 30, 2017	1
4	Trump wants Postal Service to charge 'much mor...	SEATTLE/WASHINGTON (Reuters) - President Donal...	politicsNews	December 29, 2017	1
...	...	...	...	...	...
23476	McPain: John McCain Furious That Iran Treated ...	21st Century Wire says As 21WIRE reported earl...	Middle-east	January 16, 2016	0
23477	JUSTICE? Yahoo Settles E-mail Privacy Class-ac...	21st Century Wire says It s a familiar theme. ...	Middle-east	January 16, 2016	0
23478	Sunnistan: US and Allied ‘Safe Zone’ Plan to T...	Patrick Henningsen 21st Century WireRemember ...	Middle-east	January 15, 2016	0
23479	How to Blow $700 Million: Al Jazeera America F...	21st Century Wire says Al Jazeera America will...	Middle-east	January 14, 2016	0
23480	10 U.S. Navy Sailors Held by Iranian Military ...	21st Century Wire says As 21WIRE predicted in ...	Middle-east	January 12, 2016	0

	title	text	subject	date	target	content
0	As U.S. budget fight looms, Republicans flip t...	WASHINGTON (Reuters) - The head of a conservat...	politicsNews	December 31, 2017	1	As U.S. budget fight looms, Republicans flip t...
1	U.S. military to accept transgender recruits o...	WASHINGTON (Reuters) - Transgender people will...	politicsNews	December 29, 2017	1	U.S. military to accept transgender recruits o...
2	Senior U.S. Republican senator: 'Let Mr. Muell...	WASHINGTON (Reuters) - The special counsel inv...	politicsNews	December 31, 2017	1	Senior U.S. Republican senator: 'Let Mr. Muell...
3	FBI Russia probe helped by Australian diplomat...	WASHINGTON (Reuters) - Trump campaign adviser ...	politicsNews	December 30, 2017	1	FBI Russia probe helped by Australian diplomat...
4	Trump wants Postal Service to charge 'much mor...	SEATTLE/WASHINGTON (Reuters) - President Donal...	politicsNews	December 29, 2017	1	Trump wants Postal Service to charge 'much mor...

Creating a target label for both dataset¶

Combining both dataset¶

Create a new column called content that combines the news.title and news.text together¶

Cleaning the date column¶

Deleting unnecessary columns after combination¶

Checking for missing value¶

WordCloud of Fake and True News¶

Classification of article and target¶

Naive bayes model (Multinomial)¶

RandomForest Model¶

Xgboost¶

Hyperparameter Tuning on Randomforest¶

	subject	date	target	content
0	politicsNews	2017-12-31	1	As U.S. budget fight looms, Republicans flip t...
1	politicsNews	2017-12-29	1	U.S. military to accept transgender recruits o...
2	politicsNews	2017-12-31	1	Senior U.S. Republican senator: 'Let Mr. Muell...
3	politicsNews	2017-12-30	1	FBI Russia probe helped by Australian diplomat...
4	politicsNews	2017-12-29	1	Trump wants Postal Service to charge 'much mor...