MSCI 641 Assignment Total
Assignment 1 (code only, 5%) Due date: June 8
Write a python script to perform the following data preparation activities:
- Tokenize the corpus
- Remove the following special characters: !"#$%&()*+/:;<=>@[\]^`{|}~\t\n
- Create two versions of your dataset: (1) with stopwords and (2) without stopwords. Stopword lists are available online.
- Randomly split your data into training (80%), validation (10%) and test (10%) sets.
- Command-line argument: Path to the folder where pos.txt and neg.txt reside.
- You must implement the required techniques in core Python. For instance, you can use Python’s random
module to create the train/val/test split and can read the .txt files in Python (using the in-built open
function) instead of using pandas.
- The tokenized sentences should be stored in a file called ‘out.csv’ where each line follows the format mentioned below. Note that the quotes below are just to show that they are strings, you don’t need to explicitly add them around every token: [‘My’, ‘daughter’, ‘wanted’, ‘this’, ‘book’, ‘and’, ‘the’, ‘price’, ‘on’, ‘Amazon’, ‘was’, ‘the’, ‘best.’]
- Although not required, you can remove any additional punctuation marks such as ‘-’. Additionally, it doesn’t matter if you store those punctuations as a separate token or concatenate it with the previous word.
- Expected output files:
- out.csv: tokenized sentences w/ stopwords
- train.csv: training set w/ stopwords
- val.csv: validation set w/ stopwords
- test.csv: test set w/ stopwords
- out_ns.csv: tokenized sentences w/o stopwords
- train_ns.csv: training set w/o stopwords
- val_ns.csv: validation set w/o stopwords
- test_ns.csv: test set w/o stopwords
- It is recommended that you generate the labels as well in this assignment even though it is not mandatory
neg.txt 的部分数据和格式如下:
I bought this when I bought the pop maker.
As for "pop embellishing" well, that wasn't too hard to figure out, either.
I'd save the money and spend it instead on extra pop sticks, which seem to disappear the way socks do.
didn't really care many of the cakes at all.
not up to normal standing for wilton yearbooks of the past.
Buy a Wilton magazine for less money and get more ideas and instructions for your investment.
Bag tore with almost nothing in it - Just caught the corner of a small cracker box and that was that.
My daughter wanted this book and the price on Amazon was the best.
She has already tried one recipe a day after receiving the book.
I bought this zoku quick pop for my daughterr with her zoku quick maker.
She loves it and have fun to make her own ice cream.
I was hoping there were more where those came from.
This book emphasizes very sweet dessert pops, however.
1. Solution
import os
import csv
import argparse
import string
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_special_chars(tokens, special_chars):
    """Drop every token that consists solely of special characters.

    Args:
        tokens: Iterable of token strings.
        special_chars: The characters to treat as special, given as a string
            (for example ``string.punctuation``) or any iterable of
            single characters.

    Returns:
        A new list with, in their original order, the tokens that contain at
        least one character outside ``special_chars``.
    """
    # Compare character by character. ``token not in special_chars`` on a
    # string is a substring search, which let multi-character punctuation
    # tokens such as "..." or "''" pass through untouched.
    special = set(special_chars)
    return [token for token in tokens if not all(ch in special for ch in token)]
def remove_stopwords(tokens, stopword_list):
    """Return the tokens that are not stopwords, ignoring letter case.

    Args:
        tokens: Iterable of token strings.
        stopword_list: Iterable of stopword strings.

    Returns:
        A new list with the non-stopword tokens in their original order and
        original casing.
    """
    # Case-insensitive match: a case-sensitive lookup keeps capitalised
    # stopwords such as "This" or "I". A set also makes each lookup O(1).
    stopword_set = {word.lower() for word in stopword_list}
    return [token for token in tokens if token.lower() not in stopword_set]
def process_text(input_path, output_path, output_ns_path, stopword_list, label):
    """Tokenize one text file and append labelled rows to two CSV files.

    Each input line produces one ``[label, text]`` row in each output file,
    where ``text`` is the space-joined token list. ``output_path`` receives
    the tokens after punctuation-only tokens are removed; ``output_ns_path``
    receives the same tokens with stopwords removed as well.

    Args:
        input_path: Text file to read, one sentence per line.
        output_path: CSV file to append rows with stopwords to.
        output_ns_path: CSV file to append rows without stopwords to.
        stopword_list: Stopwords passed on to ``remove_stopwords``.
        label: Value written in the first column of every row.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; the Word2Vec script in this file reads the same inputs as UTF-8.
    with open(input_path, 'r', encoding='utf-8') as input_file, \
            open(output_path, 'a', newline='', encoding='utf-8') as output_file, \
            open(output_ns_path, 'a', newline='', encoding='utf-8') as output_ns_file:
        csv_writer = csv.writer(output_file)
        csv_writer_ns = csv.writer(output_ns_file)
        for line in input_file:
            tokens = word_tokenize(line.strip())
            tokens = remove_special_chars(tokens, string.punctuation)
            csv_writer.writerow([label, ' '.join(tokens)])
            tokens_no_stopwords = remove_stopwords(tokens, stopword_list)
            csv_writer_ns.writerow([label, ' '.join(tokens_no_stopwords)])
def split_data(input_path, train_path, val_path, test_path, split_ratio=(0.8, 0.1, 0.1)):
    """Randomly split the rows of a CSV file into train/validation/test files.

    Every row of ``input_path`` is treated as data (a header row, if present,
    is shuffled along with the rest). The training and validation sizes are
    ``int(len(rows) * ratio)``; the test set receives all remaining rows.

    Args:
        input_path: CSV file to read.
        train_path: Destination CSV for the training rows.
        val_path: Destination CSV for the validation rows.
        test_path: Destination CSV for the test rows.
        split_ratio: ``(train, val, test)`` fractions; only the first two are
            used to compute sizes.
    """
    with open(input_path, 'r', newline='', encoding='utf-8') as input_file:
        lines = list(csv.reader(input_file))

    # Shuffle before slicing; otherwise the split follows file order, which
    # groups rows by label.
    random.shuffle(lines)

    train_size = int(len(lines) * split_ratio[0])
    val_size = int(len(lines) * split_ratio[1])
    val_end = train_size + val_size
    partitions = (
        (train_path, lines[:train_size]),
        (val_path, lines[train_size:val_end]),
        (test_path, lines[val_end:]),
    )
    for path, rows in partitions:
        with open(path, 'w', newline='', encoding='utf-8') as out_file:
            csv.writer(out_file).writerows(rows)
def main():
    """Command-line entry point: tokenize pos.txt/neg.txt and split the result.

    Reads ``pos.txt`` and ``neg.txt`` from ``--folder_path`` and writes, into
    the same folder, ``out.csv``/``out_ns.csv`` plus the
    ``train``/``val``/``test`` splits of each (``*_ns.csv`` for the
    stopword-free variant).
    """
    parser = argparse.ArgumentParser(description='Prepare data')
    # Required: without it folder_path is None and os.path.join raises.
    parser.add_argument('--folder_path', type=str, required=True,
                        help='Path to the folder where pos.txt and neg.txt reside.')
    args = parser.parse_args()
    stopword_list = stopwords.words('english')

    output_path = os.path.join(args.folder_path, 'out.csv')
    output_ns_path = os.path.join(args.folder_path, 'out_ns.csv')

    # process_text appends, so start from empty files. No header row is
    # written: split_data treats every row as data and would shuffle a header
    # into one of the splits.
    for path in (output_path, output_ns_path):
        with open(path, 'w', newline=''):
            pass

    for sentiment in ['pos', 'neg']:
        input_path = os.path.join(args.folder_path, f'{sentiment}.txt')
        process_text(input_path, output_path, output_ns_path, stopword_list, sentiment)

    # File names follow the assignment: train.csv / train_ns.csv, and so on.
    for suffix in ['', '_ns']:
        split_data(
            os.path.join(args.folder_path, f'out{suffix}.csv'),
            os.path.join(args.folder_path, f'train{suffix}.csv'),
            os.path.join(args.folder_path, f'val{suffix}.csv'),
            os.path.join(args.folder_path, f'test{suffix}.csv'),
        )


if __name__ == "__main__":
    main()
import os # 操作系统功能的模块,用于文件和目录操作
import csv # 用于CSV文件读写的模块
import argparse # 用于处理命令行参数的模块
import string # 包含常用字符串常量和操作的模块
import random # 用于生成随机数的模块
from nltk.corpus import stopwords # 从nltk库导入停用词列表
from nltk.tokenize import word_tokenize # 从nltk库导入单词分词函数
def remove_special_chars(tokens, special_chars):
    """Filter out tokens made up entirely of special characters.

    Args:
        tokens: Iterable of token strings.
        special_chars: Characters considered special; a string such as
            ``string.punctuation`` or any iterable of single characters.

    Returns:
        A new list of the tokens that have at least one character outside
        ``special_chars``, in their original order.
    """
    # A per-character subset test replaces ``token not in special_chars``,
    # which is a substring search on a string and therefore misses
    # multi-character punctuation tokens such as "...".
    special = frozenset(special_chars)
    return [token for token in tokens if not set(token) <= special]
def remove_stopwords(tokens, stopword_list):
    """Filter stopwords out of a token list, matching case-insensitively.

    Args:
        tokens: Iterable of token strings.
        stopword_list: Iterable of stopword strings.

    Returns:
        A new list with the remaining tokens, order and casing preserved.
    """
    # Lower-case both sides so that capitalised stopwords ("The", "I") are
    # removed too; the set gives constant-time membership tests.
    lowered_stopwords = frozenset(word.lower() for word in stopword_list)
    kept = []
    for token in tokens:
        if token.lower() not in lowered_stopwords:
            kept.append(token)
    return kept
def process_text(input_path, output_path, output_ns_path, stopword_list, label):
    """Tokenize a text file and append the results to two CSV files.

    For every line of ``input_path`` one ``[label, text]`` row is appended to
    each output: ``output_path`` gets the tokens with punctuation-only tokens
    removed, ``output_ns_path`` gets those tokens with stopwords removed too.
    Tokens are joined with single spaces.

    Args:
        input_path: Text file to read, one sentence per line.
        output_path: CSV file that receives rows with stopwords.
        output_ns_path: CSV file that receives rows without stopwords.
        stopword_list: Stopwords handed to ``remove_stopwords``.
        label: Value stored in the first column of every row.
    """
    # All three files are opened as UTF-8 explicitly; relying on the locale
    # default can fail on non-ASCII review text on some platforms.
    with open(input_path, 'r', encoding='utf-8') as input_file, \
            open(output_path, 'a', newline='', encoding='utf-8') as output_file, \
            open(output_ns_path, 'a', newline='', encoding='utf-8') as output_ns_file:
        csv_writer = csv.writer(output_file)
        csv_writer_ns = csv.writer(output_ns_file)
        for line in input_file:
            line = line.strip()  # drop the trailing newline and outer whitespace
            tokens = remove_special_chars(word_tokenize(line), string.punctuation)
            tokens_no_stopwords = remove_stopwords(tokens, stopword_list)
            csv_writer.writerow([label, ' '.join(tokens)])
            csv_writer_ns.writerow([label, ' '.join(tokens_no_stopwords)])
def split_data(input_path, train_path, val_path, test_path, split_ratio=(0.8, 0.1, 0.1), seed=None):
    """Shuffle the rows of a CSV file and split them into three CSV files.

    Every row of ``input_path`` is treated as data (a header row, if present,
    is shuffled like any other row). Training and validation sizes are
    ``int(len(rows) * ratio)``; the test file receives whatever is left.

    Args:
        input_path: CSV file to read.
        train_path: Destination CSV for the training rows.
        val_path: Destination CSV for the validation rows.
        test_path: Destination CSV for the test rows.
        split_ratio: ``(train, val, test)`` fractions; only the first two are
            used to compute sizes.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used.
    """
    with open(input_path, 'r', newline='', encoding='utf-8') as input_file:
        lines = list(csv.reader(input_file))

    # A dedicated generator keeps a seeded split from disturbing (or being
    # disturbed by) other users of the global random state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(lines)

    train_size = int(len(lines) * split_ratio[0])
    val_size = int(len(lines) * split_ratio[1])
    train_lines = lines[:train_size]
    val_lines = lines[train_size:train_size + val_size]
    test_lines = lines[train_size + val_size:]

    with open(train_path, 'w', newline='', encoding='utf-8') as train_file, \
            open(val_path, 'w', newline='', encoding='utf-8') as val_file, \
            open(test_path, 'w', newline='', encoding='utf-8') as test_file:
        csv.writer(train_file).writerows(train_lines)
        csv.writer(val_file).writerows(val_lines)
        csv.writer(test_file).writerows(test_lines)
def main():
    """Run the data-preparation pipeline from the command line.

    Reads ``pos.txt`` and ``neg.txt`` from ``--folder_path``, writes the
    tokenized rows to ``out.csv`` (with stopwords) and ``out_ns.csv`` (without
    stopwords) in that folder, then splits each into ``train``/``val``/``test``
    CSV files (``*_ns.csv`` for the stopword-free variant).
    """
    parser = argparse.ArgumentParser(description='Prepare data')
    # Marked required: a missing value would otherwise reach os.path.join as
    # None and fail with a TypeError.
    parser.add_argument('--folder_path', type=str, required=True,
                        help='Path to the folder where pos.txt and neg.txt reside.')
    args = parser.parse_args()
    stopword_list = stopwords.words('english')

    folder = args.folder_path
    output_path = os.path.join(folder, 'out.csv')
    output_ns_path = os.path.join(folder, 'out_ns.csv')

    # Truncate both outputs because process_text opens them in append mode.
    # A header row is deliberately not written: split_data shuffles every row
    # it reads, so a header would end up as a data row in one of the splits.
    for path in (output_path, output_ns_path):
        with open(path, 'w', newline=''):
            pass

    for sentiment in ['pos', 'neg']:  # the file stem doubles as the row label
        input_path = os.path.join(folder, f'{sentiment}.txt')
        process_text(input_path, output_path, output_ns_path, stopword_list, sentiment)

    # Output names match the assignment's expected files
    # (train.csv, val.csv, test.csv and their *_ns.csv counterparts).
    for suffix in ['', '_ns']:
        input_path = os.path.join(folder, f'out{suffix}.csv')
        train_path = os.path.join(folder, f'train{suffix}.csv')
        val_path = os.path.join(folder, f'val{suffix}.csv')
        test_path = os.path.join(folder, f'test{suffix}.csv')
        split_data(input_path, train_path, val_path, test_path)


if __name__ == "__main__":
    main()
import argparse
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
def read_corpus(pos_path, neg_path):
    """Read both review files into a list of filtered, lower-cased token lists.

    Each line is tokenized with ``word_tokenize``; tokens made up only of
    punctuation characters and English stopwords are dropped, and the rest
    are lower-cased.

    Args:
        pos_path: Path to the positive-review text file (UTF-8).
        neg_path: Path to the negative-review text file (UTF-8).

    Returns:
        One token list per input line: all lines of ``pos_path`` first,
        followed by all lines of ``neg_path``.
    """
    stop_words = set(stopwords.words('english'))
    # Per-character check: ``token not in string.punctuation`` is a substring
    # search and lets multi-character punctuation tokens such as "..." through.
    punctuation = set(string.punctuation)

    def tokenize_and_remove_stopwords(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                yield [
                    token.lower()
                    for token in word_tokenize(line)
                    if not all(ch in punctuation for ch in token)
                    and token.lower() not in stop_words
                ]

    return list(tokenize_and_remove_stopwords(pos_path)) + list(tokenize_and_remove_stopwords(neg_path))
def main():
    """Train a Word2Vec model on pos.txt/neg.txt and save it to disk.

    Expects ``--folder_path`` to point at the folder that contains
    ``pos.txt`` and ``neg.txt``; the model is written to ``w2v.model`` in
    that same folder.
    """
    import os  # local import: used only to build the file paths below

    parser = argparse.ArgumentParser(description='Train Word2Vec on Amazon Corpus')
    parser.add_argument('--folder_path', type=str, required=True,
                        help='Path to the folder where pos.txt and neg.txt reside.')
    args = parser.parse_args()

    corpus = read_corpus(os.path.join(args.folder_path, "pos.txt"),
                         os.path.join(args.folder_path, "neg.txt"))
    model = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
    # NOTE(review): the save target is reconstructed from the surviving
    # '+ "/w2v.model")' fragment of a truncated line; confirm the intended path.
    model.save(os.path.join(args.folder_path, "w2v.model"))


if __name__ == "__main__":
    main()
2. 对文本进行预处理,包括分词、转小写、删除停用词和标点符号。
3. 使用处理过的语料库训练 Word2Vec 模型。
4. 保存训练好的模型到磁盘。
import argparse
from gensim.models import Word2Vec
def main():
    """Print the 20 most similar words for every word in a word-list file.

    Command-line arguments:
        --model_path: Path to a saved Word2Vec model.
        --word_list: Path to a text file containing the query words.

    A word missing from the model vocabulary produces a notice instead of a
    list of neighbours.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, required=True, help="Path to the Word2Vec model")
    parser.add_argument('--word_list', type=str, required=True, help="Path to the text file containing words")
    args = parser.parse_args()

    # Load the model
    model = Word2Vec.load(args.model_path)

    # Read the word list.
    # NOTE(review): the original expression was truncated; splitting on any
    # whitespace accepts both one-word-per-line and space-separated files.
    with open(args.word_list, 'r', encoding='utf-8') as f:
        words = f.read().split()

    # Find and print the most similar words
    for word in words:
        print(f"Words most similar to {word}:")
        try:
            neighbours = model.wv.most_similar(word, topn=20)
        except KeyError:
            print(f"Word '{word}' not present in vocabulary")
            continue
        for sim_word, similarity in neighbours:
            print(f"{sim_word}: {similarity}")


if __name__ == "__main__":
    main()
(PythonCoder) ➜ A3 git:(main) ✗ python --model_path w2v.model --word_list text.txt
Words most similar to good:
great: 0.8061188459396362
decent: 0.7986379265785217
excellent: 0.7242569923400879
amazing: 0.7197015285491943
fantastic: 0.6961787343025208
awesome: 0.6774498820304871
nice: 0.6725153923034668
wonderful: 0.6683364510536194
terrific: 0.6608123183250427
bad: 0.6307231783866882
impressive: 0.6158660054206848
okay: 0.6112014055252075
ok: 0.602691113948822
superb: 0.6022987961769104
perfect: 0.5812379717826843
outstanding: 0.5731497406959534
fabulous: 0.5547349452972412
deceving: 0.5541608333587646
terrible: 0.5499886274337769
reasonable: 0.5492241382598877
Words most similar to bad:
terrible: 0.6677519679069519
horrible: 0.6475969552993774
awful: 0.6439879536628723
good: 0.6307231783866882
funny: 0.5975106954574585
poor: 0.5752034187316895
stupid: 0.5607192516326904
negative: 0.5574992299079895
good/consistent: 0.5559390783309937
reckless: 0.5521181225776672
weird: 0.5487945675849915
strange: 0.5457977056503296
wrong: 0.5342735648155212
nasty: 0.5324505567550659
silly: 0.531266450881958
lousy: 0.505230188369751
me.want: 0.5046393871307373
dumb: 0.5015211701393127
fake: 0.49868670105934143
crappy: 0.49515506625175476
Words most similar to book:
booklet: 0.7688108086585999
instruction: 0.7529064416885376
cookbook: 0.7390196323394775
recipe: 0.6936575770378113
instruction/recipe: 0.6650904417037964
guide: 0.6624084711074829
recipes: 0.6565869450569153
info: 0.6523651480674744
cookbooks: 0.6408775448799133
instructional: 0.6347687840461731
books: 0.6298976540565491
manual: 0.6288154721260071
decorating: 0.6277108192443848
youtube: 0.6230810880661011
recipies: 0.622266948223114
promotional: 0.6137821078300476
article: 0.6116392612457275
page: 0.6100519895553589
english: 0.6063416600227356
blurb: 0.6043961644172668
Words most similar to Amazon:
Word 'Amazon' not present in vocabulary
Words most similar to money:
money.i: 0.8063625693321228
cash: 0.7817959189414978
money-: 0.7630889415740967
time/money: 0.700556755065918
money.bottom: 0.6912680268287659
subscribe: 0.6883657574653625
reset.these: 0.6824569702148438
money.edit/update: 0.6633751392364502
technology.i: 0.6604342460632324
moneywish: 0.6510989665985107
money.they: 0.643433153629303
~what: 0.6417416334152222
1-866: 0.634207546710968
moneywill: 0.62284255027771
money.if: 0.621588945388794
money.why: 0.6138383150100708
shipper.yes: 0.6133355498313904
trouble/: 0.6110703349113464
moneys: 0.6091706156730652
bag-it: 0.6089490056037903
Assignment 4 (code + short report, 10%). Due date: July 3
- In addition to the general instructions posted at the beginning of the document, you should also submit an
import argparse
import gensim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
def read_corpus(pos_path, neg_path):
    """Build the training corpus from the positive and negative review files.

    Every line becomes one list of lower-cased tokens from which
    punctuation-only tokens and English stopwords have been removed.

    Args:
        pos_path: Path to the positive-review text file (UTF-8).
        neg_path: Path to the negative-review text file (UTF-8).

    Returns:
        A list of token lists: the lines of ``pos_path`` followed by the
        lines of ``neg_path``.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = frozenset(string.punctuation)

    def keep(token):
        # Subset test instead of ``token not in string.punctuation``: the
        # latter is a substring search, so tokens such as "..." were kept.
        return not set(token) <= punctuation and token.lower() not in stop_words

    def tokenize_and_remove_stopwords(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                yield [token.lower() for token in word_tokenize(line) if keep(token)]

    return list(tokenize_and_remove_stopwords(pos_path)) + list(tokenize_and_remove_stopwords(neg_path))
def main():
    """Train Word2Vec on the review corpus and store the model.

    ``--folder_path`` names the folder that holds ``pos.txt`` and
    ``neg.txt``; the trained model is saved there as ``w2v.model``.
    """
    import os  # local import: only needed for path construction

    parser = argparse.ArgumentParser(description='Train Word2Vec on Amazon Corpus')
    parser.add_argument('--folder_path', type=str, required=True,
                        help='Path to the folder where pos.txt and neg.txt reside.')
    args = parser.parse_args()

    pos_path = os.path.join(args.folder_path, "pos.txt")
    neg_path = os.path.join(args.folder_path, "neg.txt")
    corpus = read_corpus(pos_path, neg_path)

    model = gensim.models.Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
    # NOTE(review): the original line fused the constructor call with the tail
    # of a save call ('+ "/w2v.model")'); the save path is inferred from that
    # fragment and should be confirmed.
    model.save(os.path.join(args.folder_path, "w2v.model"))


if __name__ == "__main__":
    main()
必须使用 HW1 中输出的训练、验证和测试集来训练模型,只能用训练集进行训练,验证集进行调试,并用测试集来比对准确率, 可以使用带 stopword 或不带 stopword 的集。(即带_ns和不带_ns的)哪个给出的准确率更高用哪个
其中_ns后缀代表内容已经移除 stopword。
注意1:我所有的数据都有 label和text标签
注意2:不要假设已经将数据加载到了 pandas DataFrame 中,并且 DataFrame 的列为 'label' 和 'text'。切记我并没有提前完成,我其他的也是,全部没有提前完成。
train_ns.csv 部分数据如下:
neg,This item worked first day 's
neg,You probably enjoy reading book much playing game
pos,This reason I gave 4 instead 5 stars
pos,I dont actually use bake cookies fondant cutters work great
pos,I super excited get start using
pos,My husband mechanic dropped case prevents damage phone
neg,What n't care flavor ... lack thereof
train.csv 部分数据如下:
pos,The con price I still think well worth money
neg,The case slack phone well making good fit impossible
neg,It took get two tubes suspiciously resemble couple little dildos box
neg,That drive three would recognize tracks therefore would recognize disc
neg,Soon spray 've got color The fumes arent harsh still kinda stinks
pos,I needed small tongs serving carrot celery sticks dip
val_ns.csv 部分数据如下:
pos,The inside soft material protect screen
neg,The first one I purchased year ago made differently item sold
neg,In general son enjoy much
neg,Have already said game easy First time playing ranking A
neg,In order work I put bake broil burns outside cook inside
pos,worth wait wo n't recharge
val.csv 部分数据如下:
test_ns.csv 部分数据如下:
neg,I had to take an antihistamine to calm the itching
neg,The bottle is lovely and very girlie I love the amethyst jewel tone of purple
neg,When I got my new computer I was not at all happy with the USB keyboard that came with it
neg,I will get this brand of Halvah.has too many artificial ingredients
neg,It lingers forever long after you 've finished your last sip and it 's not very pleasant
neg,This stuff was so fine it went right through my strainer when I tried to minimize the particles
neg,I bought this cable and it does n't work
test.csv 部分数据如下:
pos,It covers all the buttons power camera volume/zoom and menu .4
neg,If you do n't know what your doing it wo n't go on right
neg,Now I have two giant cans that I probably wo n't drink
neg,You will be flipping left and right going up and down with no sence of direction
1. 添加列名
使用 Python 的 pandas 库来执行此操作。以下是一种可能的方法:
import pandas as pd
# Input and output locations (placeholders to be replaced with real paths).
SOURCE_CSV = 'your_file.csv'
TARGET_CSV = 'new_file.csv'

# Load the CSV, telling pandas that the file has no header row.
df = pd.read_csv(SOURCE_CSV, header=None)
# Name the first and second columns.
df.columns = ['label', 'text']
# Write the frame, now with column names, to the target file without the index.
df.to_csv(TARGET_CSV, index=False)
# -*- coding: utf-8 -*-
# @Time : 2023/7/3 21:02
# @Author : AI悦创
# @FileName:
# @Software: PyCharm
# @Blog :
import pandas as pd
import os
# Collect every CSV file under the data directory.
path = "./data"
csv_lst = []
for dirpath, dirnames, filenames in os.walk(path):
    for fn in filenames:
        # Join with dirpath (not the root) so files in sub-folders resolve
        # correctly, and skip anything that is not a CSV file.
        if fn.lower().endswith(".csv"):
            csv_lst.append(os.path.join(dirpath, fn))

# Add a header row to each file in place.
# NOTE: running this twice on the same file turns the first header into a
# data row, because every file is read with header=None.
for filename in csv_lst:
    df = pd.read_csv(filename, header=None)
    # Name the first and second columns.
    df.columns = ['label', 'text']
    # Overwrite the original file, without the index column.
    df.to_csv(filename, index=False)
在这段代码中,我们首先使用 pandas 的 read_csv
函数读取 CSV 文件。然后我们设置列标签为 'label'
和 'text',最后使用 to_csv
函数将带有新标签的 DataFrame 写回 CSV 文件。请将 'your_file.csv'
和 'new_file.csv' 替换为您实际的文件名。
这段代码默认您的 CSV 文件中的数据没有头部,且每一列以逗号分隔。如果您的数据格式不同,请根据实际情况调整 read_csv 的参数。
另外,请注意,这段代码将直接覆盖 'new_file.csv';如果该文件已存在,其原有内容将会丢失。
# Length of every training sequence, computed once and reused below.
seq_lengths = [len(seq) for seq in train_sequences]

# Longest, mean and median training-sequence length.
max_len = max(seq_lengths)
avg_len = int(np.mean(seq_lengths))
median_len = int(np.median(seq_lengths))

# Pad each split to the mean length.
train_data_avg, val_data_avg, test_data_avg = (
    pad_sequences(seqs, maxlen=avg_len)
    for seqs in (train_sequences, val_sequences, test_sequences)
)

# Pad each split to the median length.
train_data_med, val_data_med, test_data_med = (
    pad_sequences(seqs, maxlen=median_len)
    for seqs in (train_sequences, val_sequences, test_sequences)
)
pip install -q keras==2.13.1 numpy==1.25.0 pandas==2.0.3 gensim==4.3.1 tensorflow==2.13.0-rc2

