") # 2. 训练词向量 all_words = data.get_all_words() # 用所有的数据集来训练词向量,而不是只用训练集。 w2v = Word2Vec([all_words], sg=1, size=vector_size, negative=5, iter=5, window=5) vector = [] for w in all_words: vector.append(w2v.wv[w]) # 3. 将无子集的电影删除。删除后同步更新全部的all_movies、all_words、all_movies_to_clean print("电影大集合的数量", len(all_movies)) all_movies = data.remove_no_subset(all_movies) print("删除子集后的电影大集合的数量", len(all_movies)) for i in range(len(all_movies)): if not all_words[i] in all_movies: all_movies = None all_words = None all_movies_to_clean = None vector = None break # 4. 将影评中的影评词语转化成词向量 review_words = pd.read_csv("stage3Datasets/" predictionId ".csv") review_words = review_words["review_words"].values review_vectors = [] for w in review_words: try: print(w) review_vectors.append(w2v.wv[w]) except: continue # 5. 计算影评中的单词平均向量,即唯一向量 review_sum = 0 for v in review_vectors: review_sum = v review_vector = review_sum/len(review_vectors) # 6. 将所有影片通过平均向量的相似度进行排序 sim = [] for v in vector: sim.append(cosine_similarity(review_vector.reshape(-1, 1), v.reshape(-1, 1))) sim = np.array(sim) print("#"*80) print(sim) print(type(sim)) print("#"*80) print(sim.shape) sim = sim.reshape(-1, 1) target_index = np.argsort(sim, axis=0)[::-1]
《HyperProjection舞“排球少年!!顶端的风景”幕后纪录》由须贺健太 木村达成 等主演,ウォーリー木下 导演的剧情电影...
免責聲明:若本站收錄的資源侵犯了您的權益,我們會及時刪除侵權內容,謝謝合作!