# word2vec


# coding=utf-8
# !/usr/bin/python


# 只含有一个隐层

#  数据集合来自CIFAR-10

# ==============================================================================
import collections
import math
import os
import random
import zipfile

import numpy as np

import urllib


import tensorflow as tf

# Download the corpus.
# The download helper below is disabled (kept inside a string literal); a
# pre-downloaded local copy of text8.zip is assumed to sit next to this script.

url = 'http://mattmahoney.net/dc/'

"""
def maybe_download(filename, expected_bytes):
 # Download a file if not present, and make sure it's the right size.
  if not os.path.exists(filename):
    filename, _ = urllib.request.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', filename)
  else:
    print(statinfo.st_size)
    raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename
"""
# Original call, disabled along with the helper above:
#filename = maybe_download('text8.zip', 1253376)

# Use the local copy directly instead of downloading.
filename='text8.zip'

# Unpack the corpus file.

def read_data(filename):
  """Read the first member of the zip archive *filename* and return its
  contents as a list of whitespace-separated word tokens."""
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    text = tf.compat.as_str(archive.read(first_member))
  return text.split()

# Load the whole corpus into memory as a flat list of word tokens.
words = read_data(filename)
print('Data size', len(words))

 # Build the vocabulary.

# Keep only the 50,000 most frequent words; everything rarer maps to 'UNK'.
vocabulary_size = 50000

def build_dataset(words, vocabulary_size=50000):
  """Map a token list onto integer word ids, keeping the most frequent words.

  Args:
    words: list of word tokens (strings) making up the corpus.
    vocabulary_size: number of distinct ids to allocate, including the
      reserved 'UNK' id 0.  Defaults to 50000, matching the module-level
      constant, so existing single-argument callers behave unchanged.

  Returns:
    data: list of int ids, one per input token (0 for out-of-vocabulary).
    count: list of [word, frequency] pairs; count[0] is ['UNK', n_unknown].
    dictionary: word -> id mapping.
    reverse_dictionary: id -> word mapping.
  """
  # Reserve id 0 for the unknown-word bucket; its count is filled in below.
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  # Ids are assigned by frequency rank: position in `count` == word id.
  dictionary = {word: index for index, (word, _) in enumerate(count)}
  data = []
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reverse_dictionary

# Convert the corpus into integer ids plus the vocabulary lookup tables.
data, count, dictionary, reverse_dictionary = build_dataset(words)


# Drop the raw token list to save memory — only the id sequence is needed now.
del words  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# (GitBook page-extraction residue, commented out — it is not Python:)
# results matching ""
#     No results matching ""